Commit 9048b142 authored by LiuChiaChi

solve conflicts
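Besides the XPU build tweaks, most of the diff below migrates bare PADDLE_ENFORCE / PADDLE_THROW checks to typed comparison macros (PADDLE_ENFORCE_EQ/NE/GE/GT/NOT_NULL, PADDLE_ENFORCE_CUDA_SUCCESS) paired with categorized errors from platform::errors. A minimal sketch of that pattern, assuming only the macros from "paddle/fluid/platform/enforce.h"; the function and variable names here are illustrative, not taken from this commit:

#include "paddle/fluid/platform/enforce.h"

// Hypothetical check, written in the style this commit converges on.
void CheckHandleCounts(size_t num_inputs, size_t num_places) {
  // Old style: a bare condition with an unstructured message, e.g.
  //   PADDLE_ENFORCE(num_inputs == num_places, "size mismatch");
  // New style: a typed comparison macro plus a categorized error that
  // reports both the expected and the actual values.
  PADDLE_ENFORCE_EQ(
      num_inputs, num_places,
      paddle::platform::errors::InvalidArgument(
          "The number of inputs should be equal to the number of places, "
          "but got %d inputs and %d places.",
          num_inputs, num_places));
}

CUDA and NCCL calls follow the same shape with PADDLE_ENFORCE_CUDA_SUCCESS(...), which checks the returned status code rather than a boolean condition.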

@@ -4,7 +4,7 @@ endif()
 INCLUDE(ExternalProject)
 SET(XPU_PROJECT "extern_xpu")
-SET(XPU_URL "https://kunlun1.su.bcebos.com/xpu.tar.gz" CACHE STRING "" FORCE)
+SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
 SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
 SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")
...
@@ -62,9 +62,9 @@ function(op_library TARGET)
       endif()
     endif()
     if(WITH_XPU)
-      string(REPLACE "_op" "_xpu_op" XPU_FILE "${TARGET}")
-      if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${XPU_FILE}.cc)
-        list(APPEND xpu_cc_srcs xpu/${XPU_FILE}.cc)
+      string(REPLACE "_op" "_op_xpu" XPU_FILE "${TARGET}")
+      if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${XPU_FILE}.cc)
+        list(APPEND xpu_cc_srcs ${XPU_FILE}.cc)
       endif()
     endif()
   else()
@@ -83,7 +83,7 @@ function(op_library TARGET)
       list(APPEND mkldnn_cc_srcs ${src})
     elseif(${src} MATCHES ".*\\.cu.cc$")
       list(APPEND cu_cc_srcs ${src})
-    elseif(WITH_XPU AND ${src} MATCHES ".*_xpu_op.cc$")
+    elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$")
       list(APPEND xpu_cc_srcs ${src})
     elseif(${src} MATCHES ".*\\.cc$")
       list(APPEND cc_srcs ${src})
...
@@ -270,6 +270,10 @@ if(WITH_PSLIB)
     endif()
 endif(WITH_PSLIB)
 
+if(NOT WIN32 AND NOT APPLE)
+    include(external/gloo)
+    list(APPEND third_party_deps extern_gloo)
+endif()
 
 if(WITH_BOX_PS)
     include(external/box_ps)
@@ -277,10 +281,6 @@ if(WITH_BOX_PS)
 endif(WITH_BOX_PS)
 
 if(WITH_DISTRIBUTE)
-    if(WITH_GLOO)
-        include(external/gloo)
-        list(APPEND third_party_deps extern_gloo)
-    endif()
 
     if(WITH_GRPC)
         list(APPEND third_party_deps extern_grpc)
...
@@ -76,7 +76,7 @@ void AllReduceOpHandle::AllReduceImpl(
       platform::errors::InvalidArgument(
           "The NoDummyInputSize should be equal "
           "to the number of places, but got NoDummyInputSize is "
-          "%d and the number of place is %d.",
+          "%d and the number of places is %d.",
           in_var_handles.size(), num_places));
   PADDLE_ENFORCE_EQ(
       in_var_handles.size(), out_var_handles.size(),
@@ -89,7 +89,7 @@ void AllReduceOpHandle::AllReduceImpl(
       platform::errors::InvalidArgument(
           "The number of local scopes should be equal "
           "to the number of places, but got the number of local scopes is "
-          "%d and the number of place is %d.",
+          "%d and the number of places is %d.",
           in_var_handles.size(), num_places));
   std::vector<const void *> lod_tensor_data;
...
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/broadcast_op_handle.h"
+
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -31,10 +32,15 @@ void BroadcastOpHandle::RunImpl() {
   auto out_var_handles = DynamicCast<VarHandle>(outputs_);
 
   PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL,
-                    "The number of input should be one.");
-  PADDLE_ENFORCE_EQ(
-      out_var_handles.size(), places_.size(),
-      "The number of output should equal to the number of places.");
+                    platform::errors::PreconditionNotMet(
+                        "The number of inputs should be 1, but got %d.",
+                        in_var_handles.size()));
+  PADDLE_ENFORCE_EQ(out_var_handles.size(), places_.size(),
+                    platform::errors::PreconditionNotMet(
+                        "The number of outputs and the number of places should "
+                        "be equal, but got the number of outputs is %d and the "
+                        "number of places is %d.",
+                        out_var_handles.size(), places_.size()));
 
   VarHandle *in_var_handle = in_var_handles[0];
@@ -47,7 +53,9 @@ void BroadcastOpHandle::BroadcastOneVar(
     const std::vector<Scope *> &var_scopes) {
   auto *in_var =
       var_scopes.at(in_var_handle.scope_idx())->FindVar(in_var_handle.name());
-  PADDLE_ENFORCE_NOT_NULL(in_var);
+  PADDLE_ENFORCE_NOT_NULL(
+      in_var, platform::errors::NotFound("Variable %s is not found in scopes.",
+                                         in_var_handle.name()));
   Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
   if (UNLIKELY(!in_tensor.IsInitialized())) {
     VLOG(3) << "in var " << in_var_handle.name() << "not inited, return!";
@@ -103,7 +111,7 @@ void BroadcastOpHandle::BroadcastOneVar(
       broadcast_calls.emplace_back(
           [send_recv_buffer, numel, type, root_id, &nccl_ctx] {
-            PADDLE_ENFORCE(platform::dynload::ncclBcast(
+            PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
                 send_recv_buffer, numel, static_cast<ncclDataType_t>(type),
                 root_id, nccl_ctx.comm_, nccl_ctx.stream()));
           });
@@ -131,7 +139,8 @@ void BroadcastOpHandle::BroadcastOneVar(
       nccl_ctxs_->DevCtx(p)->Wait();
     }
 #else
-    PADDLE_THROW("CUDA is not enabled.");
+    PADDLE_THROW(
+        platform::errors::PreconditionNotMet("Not compiled with NCCL."));
 #endif
   }
 }
@@ -154,10 +163,13 @@ void BroadcastOpHandle::InitOutputValue(
   auto t_out_p = out_var_handle->place();
   auto *out_var = var_scopes.at(out_var_handle->scope_idx())
                       ->FindVar(out_var_handle->name());
-  PADDLE_ENFORCE_NOT_NULL(out_var);
+  PADDLE_ENFORCE_NOT_NULL(out_var, platform::errors::NotFound(
+                                       "Variable %s is not found in scopes.",
+                                       out_var_handle->name()));
   if (is_gpu_place(in_tensor.place())) {
-    PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
-                   "Places of input and output must be all on GPU.");
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(t_out_p), true,
+                      platform::errors::PreconditionNotMet(
+                          "Places of input and output must be all on GPU."));
   } else {
     t_out_p = platform::CPUPlace();
   }
...
@@ -79,7 +79,8 @@ struct TestBroadcastOpHandle {
       }
       nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_));
 #else
-      PADDLE_THROW("CUDA is not support.");
+      PADDLE_THROW(
+          platform::errors::PreconditionNotMet("Not compiled with NCCL."));
 #endif
     } else {
       int count = 8;
@@ -113,7 +114,8 @@ struct TestBroadcastOpHandle {
       op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
                                          place_list_, nccl_ctxs_.get());
 #else
-      PADDLE_THROW("CUDA is not support.");
+      PADDLE_THROW(
+          platform::errors::PreconditionNotMet("Not compiled with NCCL."));
 #endif
     } else {
 #if defined(PADDLE_WITH_NCCL)
@@ -171,7 +173,9 @@ struct TestBroadcastOpHandle {
                      float val_scalar = 0.0) {
     auto var = param_scopes_[input_scope_idx]->FindVar(varname);
-    PADDLE_ENFORCE_NOT_NULL(var);
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::NotFound("Variable %s is not found in scope.",
+                                        varname));
     auto lod_tensor = var->GetMutable<f::LoDTensor>();
     std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
     for (size_t k = 0; k < send_vector.size(); ++k) {
@@ -194,7 +198,9 @@ struct TestBroadcastOpHandle {
     }
     auto var = param_scopes_[input_scope_idx]->FindVar(varname);
-    PADDLE_ENFORCE_NOT_NULL(var);
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::NotFound("Variable %s is not found in scope.",
+                                        varname));
     auto selected_rows = var->GetMutable<f::SelectedRows>();
     auto value = selected_rows->mutable_value();
     value->mutable_data<float>(kDims, place_list_[input_scope_idx]);
@@ -211,13 +217,24 @@ struct TestBroadcastOpHandle {
                           const std::vector<float>& send_vector,
                           const std::vector<int64_t>& rows, int height) {
     auto var = param_scopes_[input_scope_idx]->FindVar(varname);
-    PADDLE_ENFORCE_NOT_NULL(var);
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::NotFound("Variable %s is not found in scope.",
+                                        varname));
     auto& selected_rows = var->Get<f::SelectedRows>();
     auto rt = selected_rows.value();
-    PADDLE_ENFORCE_EQ(selected_rows.height(), height, "height is not equal.");
+    PADDLE_ENFORCE_EQ(selected_rows.height(), height,
+                      platform::errors::InvalidArgument(
+                          "The height of SelectedRows is not equal to "
+                          "the expected, expect %d, but got %ld.",
+                          height, selected_rows.height()));
 
     for (size_t k = 0; k < selected_rows.rows().size(); ++k) {
-      PADDLE_ENFORCE_EQ(selected_rows.rows()[k], rows[k]);
+      PADDLE_ENFORCE_EQ(
+          selected_rows.rows()[k], rows[k],
+          platform::errors::InvalidArgument(
+              "The item at position %zu of rows of SelectedRows "
+              "is not equal to the expected, expect %ld, but got %ld.",
+              k, rows[k], selected_rows.rows()[k]));
     }
 
     p::CPUPlace cpu_place;
@@ -235,9 +252,15 @@ struct TestBroadcastOpHandle {
                       framework::Scope* scope) {
     p::CPUPlace cpu_place;
     auto var = scope->FindVar(varname);
-    PADDLE_ENFORCE_NOT_NULL(var);
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::NotFound("Variable %s is not found in scope.",
+                                        varname));
     auto tensor = var->Get<f::LoDTensor>();
-    PADDLE_ENFORCE_EQ(tensor.lod(), lod, "lod is not equal.");
+    PADDLE_ENFORCE_EQ(tensor.lod(), lod,
+                      platform::errors::InvalidArgument(
+                          "The LoD of tensor is not equal to "
+                          "the expected, expect %s, but got %s.",
+                          lod, tensor.lod()));
     f::Tensor result_tensor;
     f::TensorCopySync(tensor, cpu_place, &result_tensor);
     float* ct = result_tensor.mutable_data<float>(cpu_place);
...
@@ -235,7 +235,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
             AppendPass("reduce_mode_multi_devices_pass").get();
         break;
       default:
-        PADDLE_THROW("Unknown reduce strategy.");
+        PADDLE_THROW(
+            platform::errors::Unimplemented("Unknown reduce strategy."));
     }
   }
   multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
...
@@ -12,11 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
+
 #include <memory>
 #include <unordered_set>
 #include <utility>
 
-#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
 #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/scope.h"
@@ -47,15 +48,19 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
     if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
       platform::CUDADeviceGuard guard(
           BOOST_GET_CONST(platform::CUDAPlace, place).device);
-      PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
-      PADDLE_ENFORCE_NOT_NULL(event_);
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
+      PADDLE_ENFORCE_NOT_NULL(event_, platform::errors::InvalidArgument(
+                                          "The cuda event created is NULL."));
     }
   }
 #endif
-  PADDLE_ENFORCE_NE(vars.empty(), true, platform::errors::InvalidArgument(
-                                            "Variable names are empty."));
+  PADDLE_ENFORCE_NE(vars.empty(), true,
+                    platform::errors::InvalidArgument(
+                        "The variables to be deleted are empty."));
   for (auto *var : var_infos_) {
-    PADDLE_ENFORCE_NOT_NULL(var);
+    PADDLE_ENFORCE_NOT_NULL(var, platform::errors::InvalidArgument(
+                                     "The memory optimization info is NULL."));
   }
 }
@@ -64,7 +69,7 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() {
   if (event_) {
     auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_->GetPlace());
     platform::CUDADeviceGuard guard(gpu_place.device);
-    PADDLE_ENFORCE(cudaEventDestroy(event_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_));
   }
 #endif
 }
@@ -78,12 +83,17 @@ void EagerDeletionOpHandle::InitCUDA() {
 }
 
 void EagerDeletionOpHandle::CallOnce() {
-  PADDLE_ENFORCE(vars_.empty(), "vars_ must be initialized here");
+  PADDLE_ENFORCE_EQ(
+      vars_.empty(), true,
+      platform::errors::InvalidArgument(
+          "The variables to be deleted should be initialized here."));
   Scope *exec_scope = local_exec_scopes_[0];
   for (auto *var_info : var_infos_) {
     auto *var = exec_scope->FindVar(var_info->Name());
-    PADDLE_ENFORCE_NOT_NULL(var, "Variable %s should not be nullptr",
-                            var_info->Name());
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::NotFound(
+                 "The variable(%s) to be inplaced is not found in scope.",
+                 var_info->Name()));
     vars_.emplace_back(var);
   }
 }
@@ -119,8 +129,9 @@ void EagerDeletionOpHandle::RunImpl() {
         garbages.emplace_back(t.MoveMemoryHolder());
       }
     } else {
-      PADDLE_THROW("Type %s of %s is not supported eager deletion",
-                   framework::ToTypeName(var->Type()), var_info->Name());
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "The variable(%s) of type %s is not supported in eager deletion.",
+          var_info->Name(), framework::ToTypeName(var->Type())));
     }
   }
@@ -137,8 +148,9 @@ void EagerDeletionOpHandle::ClearGarbages(
     auto callback_stream =
         reinterpret_cast<StreamGarbageCollector *>(gc_)->stream();
     auto callback_func = [=]() {
-      PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
-      PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, compute_stream));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          cudaStreamWaitEvent(callback_stream, event_, 0));
     };
     gc_->Add(std::move(*garbages), callback_func);
   } else {
...
@@ -12,8 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h"
+
 #include <algorithm>
 #include <utility>
+
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
@@ -56,10 +58,20 @@ void FusedAllReduceOpHandle::RunImpl() {
   size_t place_num = places_.size();
   PADDLE_ENFORCE_EQ(
       in_var_handles.size(), place_num * num_of_all_reduce_,
-      "The NoDummyInputSize should be equal to the number of places.");
+      platform::errors::PreconditionNotMet(
+          "The number of input variable handles should be equal to the number "
+          "of places multiplied by the number of all reduce handles, "
+          "but got the number of input variable handles is %d, the "
+          "number of places is %d, and the number of all reduce handles "
+          "is %d.",
+          in_var_handles.size(), place_num, num_of_all_reduce_));
   PADDLE_ENFORCE_EQ(
       in_var_handles.size(), out_var_handles.size(),
-      "The NoDummyInputSize and NoDummyOutputSize should be equal.");
+      platform::errors::PreconditionNotMet(
+          "The number of input variable handles should be equal to the number "
+          "of output variable handles, but got the number of input variable "
+          "handles is %d, and the number of output variable handles is %d.",
+          in_var_handles.size(), out_var_handles.size()));
 
   // Note: some gradient op doesn't have CUDAKernel, so the gradients of
   // those op are in CPUPlace, in this case, the all reduce should not be fused.
@@ -106,7 +118,13 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc(
       dtype = ele_dtype;
     }
-    PADDLE_ENFORCE_EQ(ele_dtype, dtype);
+    PADDLE_ENFORCE_EQ(
+        ele_dtype, dtype,
+        platform::errors::InvalidArgument(
+            "The DataType of grad tensors of fused_all_reduce_op_handle "
+            "must be consistent. The current dtype is %s, but the "
+            "previous dtype is %s.",
+            DataTypeToString(ele_dtype), DataTypeToString(dtype)));
 
     // Check whether the address space is contiguous.
     std::sort(
@@ -130,16 +148,29 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc(
               "input[%d] address: 0X%02x. The offset: %d",
           k - 1, g_tensor.at(k - 1).first, cur_address, g_tensor.at(k).first, k,
           next_address, k, infer_next_address, offset);
-      PADDLE_ENFORCE_EQ(infer_next_address, next_address,
-                        "The address is not consistent.");
+      PADDLE_ENFORCE_EQ(
+          infer_next_address, next_address,
+          platform::errors::InvalidArgument(
+              "The inferred address of the next tensor should be equal to the "
+              "real address of the next tensor. But got inferred address is %p "
+              "and real address is %p.",
+              infer_next_address, next_address));
     }
   }
 
   if (!FLAGS_skip_fused_all_reduce_check) {
     for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
       for (size_t j = 1; j < num_of_all_reduce_; ++j) {
-        PADDLE_ENFORCE_EQ(grads_tensor.at(0).at(j).first,
-                          grads_tensor.at(scope_idx).at(j).first);
+        PADDLE_ENFORCE_EQ(
+            grads_tensor.at(0).at(j).first,
+            grads_tensor.at(scope_idx).at(j).first,
+            platform::errors::InvalidArgument(
+                "The variable name of grad tensors of "
+                "fused_all_reduce_op_handle "
+                "must be consistent. The current name is %s, but the "
+                "previous name is %s.",
+                grads_tensor.at(0).at(j).first,
+                grads_tensor.at(scope_idx).at(j).first));
       }
     }
   }
@@ -167,7 +198,9 @@ bool FusedAllReduceOpHandle::InputIsInDifferentPlace(
     for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
       auto var_name = in_var_handles[j]->name();
       auto var = local_scope->FindVar(var_name);
-      PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name);
+      PADDLE_ENFORCE_NOT_NULL(
+          var, platform::errors::NotFound(
+                   "The variable '%s' is not found in local scope.", var_name));
       auto &lod_tensor = var->Get<LoDTensor>();
       if (!is_same_place(lod_tensor.place(), places_.at(scope_idx))) {
         return true;
@@ -185,14 +218,24 @@ void FusedAllReduceOpHandle::GetGradLoDTensor(
   size_t place_num = places_.size();
   for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
     auto var_name = in_var_handles[j]->name();
-    PADDLE_ENFORCE_EQ(var_name, out_var_handles[j]->name());
+    PADDLE_ENFORCE_EQ(
+        var_name, out_var_handles[j]->name(),
+        platform::errors::InvalidArgument(
+            "The name of input variable should be equal "
+            "to the name of output variable. But got the name of input "
+            "variable is %s and the name of output variable is %s.",
+            var_name, out_var_handles[j]->name()));
     auto var = local_scope->FindVar(var_name);
-    PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name);
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::NotFound(
+                 "The variable '%s' is not found in local scope.", var_name));
    auto &lod_tensor = var->Get<LoDTensor>();
     PADDLE_ENFORCE_EQ(
         platform::is_same_place(lod_tensor.place(), places_.at(scope_idx)),
-        true, "%s(%d) is not in the right place.", var_name, scope_idx);
+        true, platform::errors::InvalidArgument(
+                  "The variable '%s' at scope %d is not in the right place.",
+                  var_name, scope_idx));
     grad_tensor->emplace_back(std::make_pair(var_name, &lod_tensor));
   }
 }
@@ -204,16 +247,26 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
   size_t size_of_dtype = 0;
   for (size_t i = 0; i < grad_tensor.size(); ++i) {
     // Get dtype
-    auto ele_type = grad_tensor.at(i).second->type();
+    auto ele_dtype = grad_tensor.at(i).second->type();
     if (i == 0) {
-      *dtype = ele_type;
-      size_of_dtype = framework::SizeOfType(ele_type);
+      *dtype = ele_dtype;
+      size_of_dtype = framework::SizeOfType(ele_dtype);
     }
-    PADDLE_ENFORCE_EQ(ele_type, *dtype);
+    PADDLE_ENFORCE_EQ(
+        ele_dtype, *dtype,
+        platform::errors::InvalidArgument(
+            "The DataType of grad tensors of fused_all_reduce_op_handle "
+            "must be consistent. The current dtype is %s, but the "
+            "previous dtype is %s.",
+            DataTypeToString(ele_dtype), DataTypeToString(*dtype)));
 
     // Get element number
     int64_t len = grad_tensor.at(i).second->numel();
-    PADDLE_ENFORCE_GT(len, 0);
+    PADDLE_ENFORCE_GT(
+        len, 0, platform::errors::InvalidArgument(
+                    "The size of grad tensors of fused_all_reduce_op_handle "
+                    "must be > 0, but got %d.",
+                    len));
     *numel +=
         platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
   }
...
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
+
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -32,7 +33,15 @@ void FusedBroadcastOpHandle::RunImpl() {
   WaitInputVarGenerated();
 
   size_t place_num = places_.size();
-  PADDLE_ENFORCE_EQ(in_var_handles.size() * place_num, out_var_handles.size());
+  PADDLE_ENFORCE_EQ(
+      in_var_handles.size() * place_num, out_var_handles.size(),
+      platform::errors::PreconditionNotMet(
+          "The number of input variable handles multiplied by the number "
+          "of places should be equal to the number of output variable "
+          "handles, but got the number of input variable handles is %d, the "
+          "number of places is %d, and the number of output variable handles "
+          "is %d.",
+          in_var_handles.size(), place_num, out_var_handles.size()));
 
   for (size_t i = 0; i < in_var_handles.size(); ++i) {
     BroadcastOneVar(
...
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
+
 #include <memory>
 #include <unordered_map>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/details/broadcast_op_handle_test.h"
 #include "paddle/fluid/framework/details/op_handle_base.h"
@@ -58,7 +60,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
       op_handle_ = new FusedBroadcastOpHandle(
          nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
 #else
-      PADDLE_THROW("CUDA is not supported.");
+      PADDLE_THROW(
+          platform::errors::PreconditionNotMet("Not compiled with CUDA."));
 #endif
     } else {
 #if defined(PADDLE_WITH_NCCL)
...
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/gather_op_handle.h"
+
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
@@ -32,13 +33,20 @@ void GatherOpHandle::RunImpl() {
   PADDLE_ENFORCE_EQ(
       in_var_handles.size(), places_.size(),
-      "The number of output should equal to the number of places.");
+      platform::errors::InvalidArgument(
+          "The number of input variables should be equal "
+          "to the number of places, but got the number of input variables is "
+          "%d and the number of places is %d.",
+          in_var_handles.size(), places_.size()));
 
   VarHandle *out_var_handle;
   {
     auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
-    PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
-                      "The number of output should be one.");
+    PADDLE_ENFORCE_EQ(
+        out_var_handles.size(), 1,
+        platform::errors::InvalidArgument(
+            "The number of output variables should be 1, but got %d.",
+            out_var_handles.size()));
     out_var_handle = out_var_handles.front();
   }
@@ -47,10 +55,14 @@ void GatherOpHandle::RunImpl() {
   auto in_0_handle = in_var_handles[0];
   auto pre_in_var =
       var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name());
-  PADDLE_ENFORCE_NOT_NULL(pre_in_var);
+  PADDLE_ENFORCE_NOT_NULL(
+      pre_in_var,
+      platform::errors::NotFound("The variable '%s' is not found in the scope.",
+                                 in_0_handle->name()));
 
-  PADDLE_ENFORCE(pre_in_var->IsType<framework::SelectedRows>(),
-                 "Currently, gather_op only can gather SelectedRows.");
+  PADDLE_ENFORCE_EQ(pre_in_var->IsType<framework::SelectedRows>(), true,
+                    platform::errors::Unimplemented(
+                        "Currently, gather_op only supports SelectedRows."));
 
   // Wait input done, this Wait is asynchronous operation
   WaitInputVarGenerated();
@@ -63,7 +75,10 @@ void GatherOpHandle::RunImpl() {
   for (auto *in_handle : in_var_handles) {
     auto *in_var =
         var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name());
-    PADDLE_ENFORCE_NOT_NULL(in_var);
+    PADDLE_ENFORCE_NOT_NULL(
+        in_var,
+        platform::errors::NotFound(
+            "The variable '%s' is not found in the scope.", in_handle->name()));
     VariableVisitor::EnforceShapeAndDTypeEQ(*in_var, *pre_in_var);
 
     auto &in_sr_value = in_var->Get<framework::SelectedRows>();
@@ -76,15 +91,19 @@ void GatherOpHandle::RunImpl() {
   // NOTE: The Places of all input tensor must be all on CPU or all on GPU.
   platform::Place t_out_p = out_var_handle->place();
   if (platform::is_gpu_place(pre_in_value.place())) {
-    PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
-                   "Places of input and output must be all on GPU.");
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(t_out_p), true,
+                      platform::errors::PreconditionNotMet(
+                          "Places of input and output must be all on GPU."));
   } else {
     t_out_p = platform::CPUPlace();
  }
 
   auto out_var = var_scopes.at(out_var_handle->scope_idx())
                      ->FindVar(out_var_handle->name());
-  PADDLE_ENFORCE_NOT_NULL(out_var);
+  PADDLE_ENFORCE_NOT_NULL(
+      out_var,
+      platform::errors::NotFound("The variable '%s' is not found in the scope.",
+                                 out_var_handle->name()));
   auto out_value = out_var->GetMutable<framework::SelectedRows>();
   out_value->set_height(pre_in_value.height());
   out_value->set_rows(out_rows);
...
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/gather_op_handle.h"
+
 #include <memory>
 #include <unordered_map>
+
 #include "gtest/gtest.h"
 
 namespace paddle {
@@ -60,7 +62,8 @@ struct TestGatherOpHandle {
         ctxs_.emplace_back(new p::CUDADeviceContext(p));
       }
 #else
-      PADDLE_THROW("CUDA is not support.");
+      PADDLE_THROW(
+          platform::errors::PreconditionNotMet("Not compiled with CUDA."));
 #endif
     } else {
       int count = 8;
@@ -141,7 +144,9 @@ struct TestGatherOpHandle {
     for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
         ++input_scope_idx) {
       auto in_var = param_scopes_.at(input_scope_idx)->FindVar("input");
-      PADDLE_ENFORCE_NOT_NULL(in_var);
+      PADDLE_ENFORCE_NOT_NULL(
+          in_var, platform::errors::NotFound(
+                      "The variable '%s' is not found in the scope.", "input"));
       auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
       auto value = in_selected_rows->mutable_value();
       value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
@@ -155,7 +160,9 @@ struct TestGatherOpHandle {
     }
 
     auto out_var = param_scopes_.at(output_scope_idx)->FindVar("out");
-    PADDLE_ENFORCE_NOT_NULL(out_var);
+    PADDLE_ENFORCE_NOT_NULL(
+        out_var, platform::errors::NotFound(
+                     "The variable '%s' is not found in the scope.", "out"));
    auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
 
     auto in_var = param_scopes_.at(output_scope_idx)->FindVar("input");
@@ -173,9 +180,19 @@ struct TestGatherOpHandle {
     auto& out_select_rows = out_var->Get<f::SelectedRows>();
     auto rt = out_select_rows.value();
 
-    PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal.");
+    PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
+                      platform::errors::InvalidArgument(
+                          "The height of SelectedRows is not equal to "
+                          "the expected, expect %d, but got %d.",
+                          height, out_select_rows.height()));
 
     for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
-      PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]);
+      PADDLE_ENFORCE_EQ(
+          out_select_rows.rows()[k], rows[k % rows.size()],
+          platform::errors::InvalidArgument(
+              "The item at position %d of rows of SelectedRows is not equal to "
+              "the expected, expect %d, but got %d.",
+              k, rows[k % rows.size()], out_select_rows.rows()[k]));
     }
 
     f::Tensor result_tensor;
@@ -207,6 +224,7 @@ TEST(GatherTester, TestGPUGatherTestSelectedRows) {
   test_op.TestGatherSelectedRows(input_scope_idx);
 }
 #endif
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
@@ -46,14 +46,17 @@ class NCCLOpHandleBase : public OpHandleBase {
   }
   virtual ~NCCLOpHandleBase() {
     for (auto& ev : inter_events_) {
-      PADDLE_ENFORCE(cudaEventDestroy(ev.second));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
     }
     for (auto& ev : exter_events_) {
-      PADDLE_ENFORCE(cudaEventDestroy(ev.second));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
     }
   }
   void SetRunEnv(int run_order, bool use_hierarchical_allreduce) {
-    PADDLE_ENFORCE(run_order >= 0, "run_order must >= 0");
+    PADDLE_ENFORCE_GE(
+        run_order, 0,
+        platform::errors::InvalidArgument(
+            "The argument run_order must be >= 0, but got %d.", run_order));
     run_order_ = run_order;
     use_hierarchical_allreduce_ = use_hierarchical_allreduce;
@@ -74,8 +77,11 @@ class NCCLOpHandleBase : public OpHandleBase {
       return;
     }
 
-    PADDLE_ENFORCE(places_.size() == 1,
-                   "HierarchicalAllReduce run one proc with one card mode.");
+    PADDLE_ENFORCE_EQ(places_.size(), 1,
+                      platform::errors::InvalidArgument(
+                          "HierarchicalAllReduce can only run "
+                          "one process with one card mode, but got %d cards.",
+                          places_.size()));
 
     for (auto& p : places_) {
       auto ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order);
@@ -88,11 +94,11 @@ class NCCLOpHandleBase : public OpHandleBase {
         continue;
       }
 
-      PADDLE_ENFORCE(cudaSetDevice(dev_id));
-      PADDLE_ENFORCE(cudaEventCreateWithFlags(&inter_events_[dev_id],
-                                              cudaEventDisableTiming));
-      PADDLE_ENFORCE(cudaEventCreateWithFlags(&exter_events_[dev_id],
-                                              cudaEventDisableTiming));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
+          &inter_events_[dev_id], cudaEventDisableTiming));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
+          &exter_events_[dev_id], cudaEventDisableTiming));
       VLOG(10) << "Create events on dev_id:" << dev_id
                << ", inter_event:" << &inter_events_[dev_id]
               << ", exter_event:" << &exter_events_[dev_id];
@@ -102,7 +108,10 @@ class NCCLOpHandleBase : public OpHandleBase {
   void FlatNCCLAllReduce(platform::Place place, const void* sendbuff,
                          void* recvbuff, size_t count, ncclDataType_t datatype,
                          ncclRedOp_t op) {
-    PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0");
+    PADDLE_ENFORCE_GE(
+        run_order_, 0,
+        platform::errors::InvalidArgument(
+            "The argument run_order_ must be >= 0, but got %d.", run_order_));
     auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_);
     int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
     auto& nccl_ctx = flat_nccl_ctxs->at(dev_id);
@@ -113,14 +122,17 @@ class NCCLOpHandleBase : public OpHandleBase {
              << ", dev_id:" << dev_id << ", dtype:" << datatype
              << ", place:" << place;
 
-    PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
        sendbuff, recvbuff, count, datatype, op, comm, stream));
   }
 
   void NCCLAllReduce(platform::Place place, const void* sendbuff,
                      void* recvbuff, size_t count, ncclDataType_t datatype,
                      ncclRedOp_t op) {
-    PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0");
+    PADDLE_ENFORCE_GE(
+        run_order_, 0,
+        platform::errors::InvalidArgument(
+            "The argument run_order_ must be >= 0, but got %d.", run_order_));
     if (!use_hierarchical_allreduce_) {
       FlatNCCLAllReduce(place, sendbuff, recvbuff, count, datatype, op);
       return;
@@ -132,7 +144,10 @@ class NCCLOpHandleBase : public OpHandleBase {
   void HierarchicalAllReduce(platform::Place place, const void* sendbuff,
                              void* recvbuff, size_t count,
                             ncclDataType_t datatype, ncclRedOp_t op) {
-    PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0");
+    PADDLE_ENFORCE_GE(
+        run_order_, 0,
+        platform::errors::InvalidArgument(
+            "The argument run_order_ must be >= 0, but got %d.", run_order_));
     InterReduce(place, sendbuff, recvbuff, count, datatype, op);
     // When a trainer is not in exter allreduce ring
     // they need not to call this.
@@ -157,14 +172,13 @@ class NCCLOpHandleBase : public OpHandleBase {
              << ", dtype:" << datatype << ", place:" << place
              << ", stream:" << stream;
 
-    PADDLE_ENFORCE(platform::dynload::ncclReduce(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
         sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream));
 
     cudaEventRecord(inter_events_.at(dev_id), stream);
 
     if (FLAGS_sync_nccl_allreduce) {
-      PADDLE_ENFORCE(cudaStreamSynchronize(stream),
-                     "sync HierarchicalAllReduce inter stream error");
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
     }
   }
@@ -172,7 +186,9 @@ class NCCLOpHandleBase : public OpHandleBase {
                    void* recvbuff, size_t count, ncclDataType_t datatype,
                    ncclRedOp_t op) {
     auto nccl_ctxs = nccl_ctxs_->GetHierarchicalExterCtx(run_order_);
-    PADDLE_ENFORCE(nccl_ctxs_, "can't get exter %d nccl_ctxs", run_order_);
+    PADDLE_ENFORCE_NOT_NULL(
+        nccl_ctxs_, platform::errors::NotFound(
+                        "Can't get exter %d nccl contexts.", run_order_));
     int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
     auto& nccl_ctx = nccl_ctxs->at(dev_id);
     auto stream = nccl_ctx.stream();
@@ -185,14 +201,13 @@ class NCCLOpHandleBase : public OpHandleBase {
     cudaStreamWaitEvent(stream, inter_events_.at(dev_id), 0);
 
-    PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
        sendbuff, recvbuff, count, datatype, op, comm, stream));
 
     cudaEventRecord(exter_events_.at(dev_id), stream);
 
     if (FLAGS_sync_nccl_allreduce) {
-      PADDLE_ENFORCE(cudaStreamSynchronize(stream),
-                     "sync HierarchicalAllReduce exter stream error");
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
    }
   }
@@ -210,8 +225,8 @@ class NCCLOpHandleBase : public OpHandleBase {
              << ", stream:" << stream;
 
     cudaStreamWaitEvent(stream, exter_events_.at(dev_id), 0);
 
-    PADDLE_ENFORCE(platform::dynload::ncclBcast(sendbuff, count, datatype, 0,
-                                                comm, stream));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
+        sendbuff, count, datatype, 0, comm, stream));
   }
 
  protected:
...
@@ -47,8 +47,8 @@ void OpHandleBase::InitCUDA() {
 #ifdef PADDLE_WITH_CUDA
   for (auto &p : dev_ctxes_) {
     int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
-    PADDLE_ENFORCE(cudaSetDevice(dev_id));
-    PADDLE_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming));
   }
   if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) {
@@ -62,17 +62,22 @@ void OpHandleBase::InitCUDA() {
       }
     }
   } else {
-    PADDLE_ENFORCE_EQ(dev_ctxes_.size(), 1UL,
-                      "%s should have only one dev_ctx.", Name());
+    PADDLE_ENFORCE_EQ(
+        dev_ctxes_.size(), 1UL,
+        platform::errors::InvalidArgument(
+            "Operator %s should have only one dev_ctx, but got %d.", Name(),
+            dev_ctxes_.size()));
     auto &place = dev_ctxes_.begin()->first;
     int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
     for (auto &out_var : outputs_) {
       auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
       if (out_var_handle) {
-        PADDLE_ENFORCE(platform::is_same_place(place, out_var_handle->place()),
-                       "The place of output(%s) is not consistent with the "
-                       "place of current op(%s).",
-                       out_var_handle->Name(), Name());
+        PADDLE_ENFORCE_EQ(
+            platform::is_same_place(place, out_var_handle->place()), true,
+            platform::errors::InvalidArgument(
+                "The place of output(%s) is not consistent with the "
+                "place of current op(%s).",
+                out_var_handle->Name(), Name()));
         out_var_handle->SetGenerateEvent(events_.at(dev_id));
       }
     }
@@ -86,7 +91,10 @@ void OpHandleBase::Run(bool use_cuda) {
     InitCUDA();
   }
 #else
-  PADDLE_ENFORCE(!use_cuda);
+  PADDLE_ENFORCE_EQ(use_cuda, false,
+                    platform::errors::InvalidArgument(
+                        "Argument use_cuda should be false when Paddle is not "
+                        "compiled with CUDA."));
 #endif
 
   // skip running current op, used with inplace_addto_op_pass
@@ -100,17 +108,20 @@ void OpHandleBase::Run(bool use_cuda) {
 void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
 #ifdef PADDLE_WITH_CUDA
-  PADDLE_ENFORCE_NOT_NULL(waited_ctx);
+  PADDLE_ENFORCE_NOT_NULL(waited_ctx, platform::errors::InvalidArgument(
+                                          "Argument waited_ctx is NULL."));
   if (platform::is_cpu_place(waited_ctx->GetPlace()) || events_.empty()) {
     for (auto &dev_ctx : dev_ctxes_) {
-      PADDLE_ENFORCE_NOT_NULL(dev_ctx.second);
+      PADDLE_ENFORCE_NOT_NULL(
+          dev_ctx.second,
+          platform::errors::InvalidArgument("The device context is NULL."));
      dev_ctx.second->Wait();
     }
   } else {
     auto stream =
         static_cast<platform::CUDADeviceContext *>(waited_ctx)->stream();
     for (auto &ev : events_) {
-      PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream, ev.second, 0));
     }
   }
 #else
@@ -145,10 +156,11 @@ void OpHandleBase::WaitInputVarGenerated() {
         auto stream =
             static_cast<platform::CUDADeviceContext *>(dev_ctxes_.at(place))
                 ->stream();
-        PADDLE_ENFORCE(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
            cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
 #else
-        PADDLE_THROW("Doesn't compile the GPU.");
+        PADDLE_THROW(
+            platform::errors::PreconditionNotMet("Not compiled with CUDA."));
 #endif
       }
       // There are nothing to do when the place is CPUPlace.
@@ -169,10 +181,11 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
        auto stream = static_cast<platform::CUDADeviceContext *>(
                           dev_ctxes_.at(in_var_handle->place()))
                           ->stream();
-        PADDLE_ENFORCE(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
            cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
 #else
-        PADDLE_THROW("Doesn't compile the GPU.");
+        PADDLE_THROW(
+            platform::errors::PreconditionNotMet("Not compiled with CUDA."));
 #endif
       }
       // There are nothing to do when the place is CPUPlace.
@@ -242,7 +255,9 @@ void OpHandleBase::SetLocalExecScopes(
     auto scopes = GetLocalScopes();
     for (auto *scope : scopes) {
       auto iter = scope_map.find(scope);
-      PADDLE_ENFORCE(iter != scope_map.end(), "Local scope not found");
+      PADDLE_ENFORCE_NE(
+          iter, scope_map.end(),
+          platform::errors::NotFound("Local scope not found in scope map."));
      local_exec_scopes_.emplace_back(iter->second);
     }
   }
...
@@ -21,6 +21,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/grad_op_desc_maker.h"
 #include "paddle/fluid/framework/inplace_op_inference.h"
 #include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
@@ -186,19 +187,20 @@ struct OpInfoFiller<T, kOpProtoAndCheckerMaker> {
   void operator()(const char* op_type, OpInfo* info) const {
     PADDLE_ENFORCE_EQ(info->proto_, nullptr,
                       platform::errors::AlreadyExists(
-                          "OpProto of %s has been registered", op_type));
+                          "OpProto of %s has been registered.", op_type));
     PADDLE_ENFORCE_EQ(info->checker_, nullptr,
                       platform::errors::AlreadyExists(
-                          "OpAttrChecker of %s has been registered", op_type));
+                          "OpAttrChecker of %s has been registered.", op_type));
     info->proto_ = new proto::OpProto;
     info->checker_ = new OpAttrChecker();
     T maker;
     maker(info->proto_, info->checker_);
     info->proto_->set_type(op_type);
-    PADDLE_ENFORCE(
-        info->proto_->IsInitialized(),
-        "Fail to initialize %s's OpProto, because %s is not initialized",
-        op_type, info->proto_->InitializationErrorString());
+    PADDLE_ENFORCE_EQ(
+        info->proto_->IsInitialized(), true,
+        platform::errors::PreconditionNotMet(
+            "Fail to initialize %s's OpProto, because %s is not initialized.",
+            op_type, info->proto_->InitializationErrorString()));
   }
 };
...
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <algorithm> #include <algorithm>
#include <map> #include <map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
...@@ -32,9 +33,13 @@ struct ReduceLoDTensor { ...@@ -32,9 +33,13 @@ struct ReduceLoDTensor {
template <typename T> template <typename T>
void apply() const { void apply() const {
PADDLE_ENFORCE(!src_tensors_.empty()); PADDLE_ENFORCE_NE(src_tensors_.empty(), true,
platform::errors::InvalidArgument(
"The number of tensors to be reduced is 0."));
auto &t0 = *src_tensors_[0]; auto &t0 = *src_tensors_[0];
PADDLE_ENFORCE_NE(t0.numel(), 0); PADDLE_ENFORCE_NE(t0.numel(), 0,
platform::errors::InvalidArgument(
"The size of first tensor to be reduced is 0."));
dst_tensor_.Resize(t0.dims()); dst_tensor_.Resize(t0.dims());
T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace()); T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
...@@ -45,8 +50,19 @@ struct ReduceLoDTensor { ...@@ -45,8 +50,19 @@ struct ReduceLoDTensor {
continue; continue;
} }
PADDLE_ENFORCE_EQ(t.dims(), t0.dims()); PADDLE_ENFORCE_EQ(t.dims(), t0.dims(),
PADDLE_ENFORCE_EQ(t.type(), t0.type()); platform::errors::InvalidArgument(
"The shape of tensors to be reduced must be "
"consistent. The shape of current tensor is %s, "
"but the shape of the first tensor is %s.",
t.dims(), t0.dims()));
PADDLE_ENFORCE_EQ(t.type(), t0.type(),
platform::errors::InvalidArgument(
"The type of tensors to be reduced must be "
"consistent. The type of current tensor is %s, "
"but the type of the first tensor is %s.",
t.type(), t0.type()));
std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst, std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
[](T a, T b) -> T { return a + b; }); [](T a, T b) -> T { return a + b; });
} }
...@@ -88,7 +104,9 @@ struct GatherLocalSelectedRowsFunctor { ...@@ -88,7 +104,9 @@ struct GatherLocalSelectedRowsFunctor {
in_places_(in_places), in_places_(in_places),
out_place_(out_place), out_place_(out_place),
dst_selected_rows_(dst_selected_rows) { dst_selected_rows_(dst_selected_rows) {
PADDLE_ENFORCE_EQ(src_selected_rows.empty(), false); PADDLE_ENFORCE_NE(src_selected_rows.empty(), true,
platform::errors::InvalidArgument(
"The number of selected_rows to be gathered is 0."));
std::vector<int64_t> out_rows; std::vector<int64_t> out_rows;
......
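Editor's note: a plain-vector model of the ReduceLoDTensor accumulation above may help — tensor 0 seeds the destination and every later, same-shaped tensor is added element-wise via std::transform (std::vector<float> stands in for LoDTensor; the asserts mirror the new enforce checks):

#include <algorithm>
#include <cassert>
#include <vector>

void ReduceInto(const std::vector<std::vector<float>>& srcs,
                std::vector<float>* dst) {
  assert(!srcs.empty());            // "number of tensors to be reduced is 0"
  assert(!srcs.front().empty());    // "size of the first tensor is 0"
  *dst = srcs.front();              // copy tensor 0 into the destination
  for (size_t i = 1; i < srcs.size(); ++i) {
    assert(srcs[i].size() == dst->size());  // shapes must be consistent
    std::transform(srcs[i].begin(), srcs[i].end(), dst->begin(),
                   dst->begin(), [](float a, float b) { return a + b; });
  }
}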
...@@ -13,7 +13,9 @@ ...@@ -13,7 +13,9 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/reduce_op_handle.h"
#include <memory> #include <memory>
#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/framework/details/variable_visitor.h"
...@@ -116,8 +118,15 @@ void ReduceOpHandle::GatherSelectedRows( ...@@ -116,8 +118,15 @@ void ReduceOpHandle::GatherSelectedRows(
merged_dev_ctx->Wait(); merged_dev_ctx->Wait();
scope->EraseVars(std::vector<std::string>{gathered_var_name}); scope->EraseVars(std::vector<std::string>{gathered_var_name});
PADDLE_ENFORCE(client->Gather(vars, &remote, *merged_dev_ctx, scope)); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE(remote.size() == vars.size()); client->Gather(vars, &remote, *merged_dev_ctx, scope), true,
platform::errors::PreconditionNotMet("Gather SelectedRows failed."));
PADDLE_ENFORCE_EQ(remote.size(), vars.size(),
platform::errors::PreconditionNotMet(
"The number of remotes should be equal to the number "
"of variables to be gathered, but got the number of "
"remotes is %d and the number of variables is %d.",
remote.size(), vars.size()));
// 4. merged local selected rows. // 4. merged local selected rows.
std::vector<const SelectedRows *> all; std::vector<const SelectedRows *> all;
...@@ -151,14 +160,19 @@ void ReduceOpHandle::RunImpl() { ...@@ -151,14 +160,19 @@ void ReduceOpHandle::RunImpl() {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(), in_var_handles.size(), places_.size(),
"The number of output should equal to the number of places."); platform::errors::InvalidArgument(
"The number of inputs should equal to the number of places, but got "
"the number of inputs is %d and the number of places is %d.",
in_var_handles.size(), places_.size()));
VarHandle *out_var_handle; VarHandle *out_var_handle;
{ {
auto out_var_handles = DynamicCast<VarHandle>(outputs_); auto out_var_handles = DynamicCast<VarHandle>(outputs_);
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1UL, PADDLE_ENFORCE_EQ(out_var_handles.size(), 1UL,
"The number of output should be one."); platform::errors::InvalidArgument(
"The number of output should be one, but got %d.",
out_var_handles.size()));
out_var_handle = out_var_handles.front(); out_var_handle = out_var_handles.front();
} }
...@@ -168,7 +182,10 @@ void ReduceOpHandle::RunImpl() { ...@@ -168,7 +182,10 @@ void ReduceOpHandle::RunImpl() {
auto pre_in_var = auto pre_in_var =
var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name()); var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name());
PADDLE_ENFORCE_NOT_NULL(pre_in_var);
PADDLE_ENFORCE_NOT_NULL(pre_in_var, platform::errors::NotFound(
"Variable %s is not found in scope.",
in_0_handle->name()));
// NOTE: The Places of all input tensor must be all on CPU or all on GPU. // NOTE: The Places of all input tensor must be all on CPU or all on GPU.
std::vector<platform::Place> in_places; // used to get dev_ctx std::vector<platform::Place> in_places; // used to get dev_ctx
...@@ -176,21 +193,29 @@ void ReduceOpHandle::RunImpl() { ...@@ -176,21 +193,29 @@ void ReduceOpHandle::RunImpl() {
in_places.emplace_back(in_handle->place()); in_places.emplace_back(in_handle->place());
auto in_var = auto in_var =
var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name()); var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name());
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound("Variable %s is not found in scope.",
in_handle->name()));
VariableVisitor::EnforceShapeAndDTypeEQ(*pre_in_var, *in_var); VariableVisitor::EnforceShapeAndDTypeEQ(*pre_in_var, *in_var);
} }
auto out_var = var_scopes.at(out_var_handle->scope_idx()) auto out_var = var_scopes.at(out_var_handle->scope_idx())
->FindVar(out_var_handle->name()); ->FindVar(out_var_handle->name());
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_NOT_NULL(
out_var, platform::errors::NotFound("Variable %s is not found in scope.",
out_var_handle->name()));
// NOTE: The tensors' Place of input and output must be all on GPU or all on // NOTE: The tensors' Place of input and output must be all on GPU or all on
// CPU. // CPU.
auto in_p = VariableVisitor::GetMutableTensor(pre_in_var).place(); auto in_p = VariableVisitor::GetMutableTensor(pre_in_var).place();
platform::Place t_out_p; platform::Place t_out_p;
if (platform::is_gpu_place(in_p)) { if (platform::is_gpu_place(in_p)) {
PADDLE_ENFORCE(platform::is_gpu_place(out_var_handle->place()), PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_var_handle->place()), true,
"Places of input and output must be all on GPU."); platform::errors::PreconditionNotMet(
"Places of input and output must be all on GPU."));
t_out_p = out_var_handle->place(); t_out_p = out_var_handle->place();
} else { } else {
t_out_p = platform::CPUPlace(); t_out_p = platform::CPUPlace();
...@@ -229,7 +254,10 @@ void ReduceOpHandle::RunImpl() { ...@@ -229,7 +254,10 @@ void ReduceOpHandle::RunImpl() {
in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p, in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
out_var->GetMutable<framework::SelectedRows>()); out_var->GetMutable<framework::SelectedRows>());
} else { } else {
PADDLE_THROW("only support double or float when gather SelectedRows"); PADDLE_THROW(platform::errors::Unimplemented(
"Only support double or float when gather SelectedRows, but got "
"%s.",
framework::DataTypeToString(in_selected_rows[0]->value().type())));
} }
#endif #endif
}); });
...@@ -292,7 +320,7 @@ void ReduceOpHandle::RunImpl() { ...@@ -292,7 +320,7 @@ void ReduceOpHandle::RunImpl() {
size_t numel = static_cast<size_t>(lod_tensor.numel()); size_t numel = static_cast<size_t>(lod_tensor.numel());
all_reduce_calls.emplace_back( all_reduce_calls.emplace_back(
[buffer, recvbuffer, type, numel, root_id, &nccl_ctx] { [buffer, recvbuffer, type, numel, root_id, &nccl_ctx] {
PADDLE_ENFORCE(platform::dynload::ncclReduce( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
buffer, recvbuffer, numel, static_cast<ncclDataType_t>(type), buffer, recvbuffer, numel, static_cast<ncclDataType_t>(type),
ncclSum, root_id, nccl_ctx.comm_, nccl_ctx.stream())); ncclSum, root_id, nccl_ctx.comm_, nccl_ctx.stream()));
}); });
...@@ -306,10 +334,13 @@ void ReduceOpHandle::RunImpl() { ...@@ -306,10 +334,13 @@ void ReduceOpHandle::RunImpl() {
} }
}); });
#else #else
PADDLE_THROW("CUDA is not enabled."); PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif #endif
} else { } else {
PADDLE_THROW("Place should be CPUPlace or CUDAPlace."); PADDLE_THROW(platform::errors::InvalidArgument(
"The place of tensor should be CPUPlace or CUDAPlace, but got %s.",
lod_tensors[0]->place()));
} }
} }
} }
......
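Editor's note: PADDLE_ENFORCE_CUDA_SUCCESS, adopted for ncclReduce above, expresses the "enforce success" idiom for C-style APIs that report failure through a return status. A standalone sketch (fake_reduce is a stand-in, not a real NCCL call):

#include <cstddef>
#include <stdexcept>
#include <string>

using Status = int;  // 0 plays the role of ncclSuccess / cudaSuccess

Status fake_reduce(const float* send, float* recv, std::size_t n) {
  return (send != nullptr && recv != nullptr && n > 0) ? 0 : 1;
}

// Turn any non-zero status into an exception naming the failed call.
void EnforceSuccess(Status s, const std::string& call) {
  if (s != 0)
    throw std::runtime_error(call + " failed with status " +
                             std::to_string(s));
}

int main() {
  float in[4] = {1, 2, 3, 4}, out[4] = {};
  EnforceSuccess(fake_reduce(in, out, 4), "fake_reduce");
  return 0;
}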
...@@ -13,7 +13,9 @@ ...@@ -13,7 +13,9 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/reduce_op_handle.h"
#include <unordered_map> #include <unordered_map>
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
...@@ -69,7 +71,8 @@ struct TestReduceOpHandle { ...@@ -69,7 +71,8 @@ struct TestReduceOpHandle {
} }
nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_)); nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
#else #else
PADDLE_THROW("CUDA is not support."); PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCCL."));
#endif #endif
} else { } else {
int count = 8; int count = 8;
...@@ -103,7 +106,8 @@ struct TestReduceOpHandle { ...@@ -103,7 +106,8 @@ struct TestReduceOpHandle {
op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_, op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
gpu_list_, nccl_ctxs_.get())); gpu_list_, nccl_ctxs_.get()));
#else #else
PADDLE_THROW("CUDA is not support."); PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCCL."));
#endif #endif
} else { } else {
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL)
...@@ -164,7 +168,10 @@ struct TestReduceOpHandle { ...@@ -164,7 +168,10 @@ struct TestReduceOpHandle {
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size(); for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) { ++input_scope_idx) {
auto in_var = param_scopes_[input_scope_idx]->FindVar("input"); auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound(
"Variable %s is not found in scope.", "input"));
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>(); auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
auto value = in_selected_rows->mutable_value(); auto value = in_selected_rows->mutable_value();
value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]); value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
...@@ -178,7 +185,9 @@ struct TestReduceOpHandle { ...@@ -178,7 +185,9 @@ struct TestReduceOpHandle {
} }
auto out_var = param_scopes_[output_scope_idx]->FindVar("out"); auto out_var = param_scopes_[output_scope_idx]->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var); PADDLE_ENFORCE_NOT_NULL(out_var,
platform::errors::NotFound(
"Variable %s is not found in scope.", "out"));
auto out_selected_rows = out_var->GetMutable<f::SelectedRows>(); auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
auto in_var = param_scopes_[output_scope_idx]->FindVar("input"); auto in_var = param_scopes_[output_scope_idx]->FindVar("input");
...@@ -196,9 +205,18 @@ struct TestReduceOpHandle { ...@@ -196,9 +205,18 @@ struct TestReduceOpHandle {
auto &out_select_rows = out_var->Get<f::SelectedRows>(); auto &out_select_rows = out_var->Get<f::SelectedRows>();
auto rt = out_select_rows.value(); auto rt = out_select_rows.value();
PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal."); PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
platform::errors::InvalidArgument(
"The height of SelectedRows is not equal to "
"the expected, expect %d, but got %d.",
height, out_select_rows.height()));
for (size_t k = 0; k < out_select_rows.rows().size(); ++k) { for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]); PADDLE_ENFORCE_EQ(
out_select_rows.rows()[k], rows[k % rows.size()],
platform::errors::InvalidArgument(
"The item at position %d of rows of SelectedRows is not equal to "
"the expected, expect %d, but got %d.",
k, rows[k % rows.size()], out_select_rows.rows()[k]));
} }
f::Tensor result_tensor; f::Tensor result_tensor;
...@@ -208,7 +226,7 @@ struct TestReduceOpHandle { ...@@ -208,7 +226,7 @@ struct TestReduceOpHandle {
for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) { for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) {
ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5); ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5);
} }
} }
void TestReduceLodTensors(size_t output_scope_idx) { void TestReduceLodTensors(size_t output_scope_idx) {
std::vector<float> send_vector(static_cast<size_t>(f::product(kDims))); std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
...@@ -220,7 +238,9 @@ struct TestReduceOpHandle { ...@@ -220,7 +238,9 @@ struct TestReduceOpHandle {
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size(); for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) { ++input_scope_idx) {
auto in_var = param_scopes_[input_scope_idx]->FindVar("input"); auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var); PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound(
"Variable %s is not found in scope.", "input"));
auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>(); auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]); in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
in_lod_tensor->set_lod(lod); in_lod_tensor->set_lod(lod);
...@@ -230,7 +250,9 @@ struct TestReduceOpHandle { ...@@ -230,7 +250,9 @@ struct TestReduceOpHandle {
} }
auto out_var = param_scopes_[output_scope_idx]->FindVar("out"); auto out_var = param_scopes_[output_scope_idx]->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var); PADDLE_ENFORCE_NOT_NULL(out_var,
platform::errors::NotFound(
"Variable %s is not found in scope.", "out"));
auto out_lodtensor = out_var->GetMutable<f::LoDTensor>(); auto out_lodtensor = out_var->GetMutable<f::LoDTensor>();
auto in_var = param_scopes_[output_scope_idx]->FindVar("input"); auto in_var = param_scopes_[output_scope_idx]->FindVar("input");
...@@ -254,7 +276,7 @@ struct TestReduceOpHandle { ...@@ -254,7 +276,7 @@ struct TestReduceOpHandle {
ASSERT_NEAR(ct[j], send_vector[j] * gpu_list_.size(), 1e-5); ASSERT_NEAR(ct[j], send_vector[j] * gpu_list_.size(), 1e-5);
} }
} }
}; };
TEST(ReduceTester, TestCPUReduceTestSelectedRows) { TEST(ReduceTester, TestCPUReduceTestSelectedRows) {
TestReduceOpHandle test_op; TestReduceOpHandle test_op;
......
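Editor's note: a compact model of the invariant the test above asserts — gathering SelectedRows from nranks scopes concatenates each scope's row list, so output row k must equal rows[k % rows.size()]:

#include <cassert>
#include <cstddef>
#include <vector>

std::vector<int> GatherRows(const std::vector<int>& rows, int nranks) {
  std::vector<int> out;
  for (int r = 0; r < nranks; ++r)
    out.insert(out.end(), rows.begin(), rows.end());  // one copy per rank
  return out;
}

int main() {
  const std::vector<int> rows = {0, 1, 3};
  const auto out = GatherRows(rows, 2);
  for (std::size_t k = 0; k < out.size(); ++k)
    assert(out[k] == rows[k % rows.size()]);  // the check in the test
  return 0;
}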
...@@ -111,13 +111,12 @@ void ShareTensorBufferFunctor::CallOnce() { ...@@ -111,13 +111,12 @@ void ShareTensorBufferFunctor::CallOnce() {
auto *out_var = exec_scope_->FindVar(out_var_names_[i]); auto *out_var = exec_scope_->FindVar(out_var_names_[i]);
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound( in_var, platform::errors::NotFound(
"The input variable(%s)to be inplaced should not be NULL.", "The variable(%s) to be inplaced is not found in scope.",
in_var_infos_[i]->Name())); in_var_infos_[i]->Name()));
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
out_var, out_var, platform::errors::NotFound(
platform::errors::NotFound( "The variable(%s) to be inplaced is not found in scope.",
"The output variable(%s) to be inplaced should not be NULL.", out_var_names_[i]));
out_var_names_[i]));
PADDLE_ENFORCE_NE( PADDLE_ENFORCE_NE(
in_var, out_var, in_var, out_var,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
......
...@@ -12,8 +12,10 @@ ...@@ -12,8 +12,10 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h" #include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h"
#include <algorithm> #include <algorithm>
#include <utility> #include <utility>
#include "dgc/dgc.h" #include "dgc/dgc.h"
#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/reduce_and_gather.h"
...@@ -38,18 +40,23 @@ SparseAllReduceOpHandle::SparseAllReduceOpHandle( ...@@ -38,18 +40,23 @@ SparseAllReduceOpHandle::SparseAllReduceOpHandle(
is_encoded_(is_encoded), is_encoded_(is_encoded),
nranks_(nranks) { nranks_(nranks) {
// TODO(gongwb) :polish them! // TODO(gongwb) :polish them!
PADDLE_ENFORCE_EQ(is_encoded, true); PADDLE_ENFORCE_EQ(is_encoded, true, platform::errors::InvalidArgument(
"The argument is_encoded is false."));
VLOG(1) << "Use dgc allreduce mode" VLOG(1) << "Use dgc allreduce mode"
<< ", nranks:" << nranks_; << ", nranks:" << nranks_;
PADDLE_ENFORCE_GT(local_scopes_.size(), 0); PADDLE_ENFORCE_GT(local_scopes_.size(), 0,
platform::errors::PreconditionNotMet(
"The number of local scope should be > 0, but got %zu.",
local_scopes_.size()));
auto nranks_name = g_dgc_nranks; auto nranks_name = g_dgc_nranks;
for (size_t i = 0; i < local_scopes_.size(); ++i) { for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto *local_scope = local_scopes_[i]; auto *local_scope = local_scopes_[i];
auto nranks_var = local_scope->FindVar(nranks_name); auto nranks_var = local_scope->FindVar(nranks_name);
if (nranks_var == nullptr) {
PADDLE_THROW("not find nranks_var:%s", nranks_name); PADDLE_ENFORCE_NOT_NULL(
} nranks_var, platform::errors::NotFound(
"Variable %s is not found in scope.", nranks_name));
float *dgc_nranks = nranks_var->GetMutable<LoDTensor>()->data<float>(); float *dgc_nranks = nranks_var->GetMutable<LoDTensor>()->data<float>();
*dgc_nranks = nranks; *dgc_nranks = nranks;
...@@ -64,10 +71,18 @@ void SparseAllReduceOpHandle::RunImplEncoded() { ...@@ -64,10 +71,18 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs()); auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(), in_var_handles.size(), places_.size(),
"The NoDummyInputSize should be equal to the number of places."); platform::errors::PreconditionNotMet(
"The number of input variables should be equal to the number of "
"places, but got the number of input variables is %zu and the the "
"number of places is %zu.",
in_var_handles.size(), places_.size()));
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(), in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal."); platform::errors::PreconditionNotMet(
"The number of input variables should be equal to the number of "
"output variables, but got the number of input variables is %zu and "
"the the number of output variables is %zu.",
in_var_handles.size(), out_var_handles.size()));
std::vector<const LoDTensor *> ins; std::vector<const LoDTensor *> ins;
std::vector<LoDTensor *> gathers; std::vector<LoDTensor *> gathers;
...@@ -80,14 +95,17 @@ void SparseAllReduceOpHandle::RunImplEncoded() { ...@@ -80,14 +95,17 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
auto encode_var_name = original_name + g_dgc_encoded; auto encode_var_name = original_name + g_dgc_encoded;
auto *in_var = local_scope->FindVar(encode_var_name); auto *in_var = local_scope->FindVar(encode_var_name);
PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name); PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound("Variable %s is not found in scope.",
encode_var_name));
auto &in = in_var->Get<LoDTensor>(); auto &in = in_var->Get<LoDTensor>();
ins.emplace_back(&in); ins.emplace_back(&in);
auto gather_var_name = original_name + g_dgc_gather; auto gather_var_name = original_name + g_dgc_gather;
auto *gather_var = local_scope->FindVar(gather_var_name); auto *gather_var = local_scope->FindVar(gather_var_name);
PADDLE_ENFORCE_NOT_NULL(gather_var, "%s should not be null", PADDLE_ENFORCE_NOT_NULL(
gather_var_name); gather_var, platform::errors::NotFound(
"Variable %s is not found in scope.", gather_var));
auto *gather = gather_var->GetMutable<LoDTensor>(); auto *gather = gather_var->GetMutable<LoDTensor>();
gathers.emplace_back(gather); gathers.emplace_back(gather);
...@@ -100,14 +118,26 @@ void SparseAllReduceOpHandle::RunImplEncoded() { ...@@ -100,14 +118,26 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
} }
} }
PADDLE_ENFORCE(platform::is_gpu_place(ins[0]->place())); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE(platform::is_gpu_place(outs[0]->place())); platform::is_gpu_place(ins[0]->place()), true,
PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); platform::errors::InvalidArgument(
"The place of input variable should be CUDAPlace, but got %s.",
ins[0]->place()));
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(outs[0]->place()), true,
platform::errors::InvalidArgument(
"The place of input variable should be CUDAPlace, but got %s.",
outs[0]->place()));
PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, platform::errors::PreconditionNotMet(
"The nccl contexts are NULL."));
int dtype = -1; int dtype = -1;
size_t in_numel = 0; size_t in_numel = 0;
size_t out_numel = 0; size_t out_numel = 0;
PADDLE_ENFORCE(nranks_ > 1); PADDLE_ENFORCE_GT(
nranks_, 1,
platform::errors::PreconditionNotMet(
"The number of ranks should be > 1, but got %d.", nranks_));
std::vector<std::function<void()>> all_gather_calls; std::vector<std::function<void()>> all_gather_calls;
std::vector<std::function<void()>> sparse_reduce_calls; std::vector<std::function<void()>> sparse_reduce_calls;
...@@ -123,8 +153,16 @@ void SparseAllReduceOpHandle::RunImplEncoded() { ...@@ -123,8 +153,16 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
dtype = (dtype == -1) ? platform::ToNCCLDataType(in.type()) : dtype; dtype = (dtype == -1) ? platform::ToNCCLDataType(in.type()) : dtype;
in_numel = (in_numel == 0) ? static_cast<size_t>(in.numel()) : in_numel; in_numel = (in_numel == 0) ? static_cast<size_t>(in.numel()) : in_numel;
PADDLE_ENFORCE(in_numel % 2 == 0); PADDLE_ENFORCE_EQ(in_numel % 2, 0,
PADDLE_ENFORCE(in_numel / 2 == static_cast<size_t>(k)); platform::errors::InvalidArgument(
"The number of elements of input variable should be "
"even, but got %zu.",
in_numel));
PADDLE_ENFORCE_EQ(in_numel / 2, static_cast<size_t>(k),
platform::errors::InvalidArgument(
"The number of elements of input variable should be "
"even, but got %zu.",
in_numel));
out_numel = (out_numel == 0) ? static_cast<size_t>(out.numel()) : out_numel; out_numel = (out_numel == 0) ? static_cast<size_t>(out.numel()) : out_numel;
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
...@@ -154,7 +192,8 @@ void SparseAllReduceOpHandle::RunImplEncoded() { ...@@ -154,7 +192,8 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
PADDLE_ENFORCE_EQ(paddle::communication::dgc::sparseReduce( PADDLE_ENFORCE_EQ(paddle::communication::dgc::sparseReduce(
gather_buff, k, out_tensor_buf, gather_buff, k, out_tensor_buf,
static_cast<int>(out_numel), nranks_, stream), static_cast<int>(out_numel), nranks_, stream),
true); true, platform::errors::Unavailable(
"Calling sparseReduce() failed."));
}); });
} }
...@@ -187,11 +226,16 @@ void SparseAllReduceOpHandle::SparseAllReduceFunc( ...@@ -187,11 +226,16 @@ void SparseAllReduceOpHandle::SparseAllReduceFunc(
int SparseAllReduceOpHandle::GetKValue(const std::string &grad_name) { int SparseAllReduceOpHandle::GetKValue(const std::string &grad_name) {
auto original_name = paddle::framework::GradOriginalVarName(grad_name); auto original_name = paddle::framework::GradOriginalVarName(grad_name);
auto var_name = original_name + g_dgc_k; auto var_name = original_name + g_dgc_k;
PADDLE_ENFORCE(local_scopes_.size() > 0); PADDLE_ENFORCE_GT(local_scopes_.size(), 0,
platform::errors::PreconditionNotMet(
"The number of local scope should be > 0, but got %zu.",
local_scopes_.size()));
auto *scope = local_exec_scopes_[0]; auto *scope = local_exec_scopes_[0];
auto var = scope->FindVar(var_name); auto var = scope->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var); PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
var_name));
auto tensor = var->Get<LoDTensor>().data<float>(); auto tensor = var->Get<LoDTensor>().data<float>();
return *tensor; return *tensor;
} }
...@@ -202,15 +246,22 @@ bool SparseAllReduceOpHandle::IsEncoded() { ...@@ -202,15 +246,22 @@ bool SparseAllReduceOpHandle::IsEncoded() {
} }
auto counter_name = g_dgc_counter_name; auto counter_name = g_dgc_counter_name;
auto step_name = g_dgc_rampup_begin_step; auto step_name = g_dgc_rampup_begin_step;
PADDLE_ENFORCE(local_scopes_.size() > 0);
PADDLE_ENFORCE_GT(local_scopes_.size(), 0,
platform::errors::PreconditionNotMet(
"The number of local scope should be > 0, but got %zu.",
local_scopes_.size()));
auto *local_scope = local_exec_scopes_[0]; auto *local_scope = local_exec_scopes_[0];
auto count_var = local_scope->FindVar(counter_name); auto count_var = local_scope->FindVar(counter_name);
auto step_var = local_scope->FindVar(step_name); auto step_var = local_scope->FindVar(step_name);
if (count_var == nullptr || step_var == nullptr) {
PADDLE_THROW("not find count_var:%s or step_var:%s", counter_name, PADDLE_ENFORCE_NOT_NULL(
step_var); count_var, platform::errors::NotFound(
} "Variable %s is not found in scope.", counter_name));
PADDLE_ENFORCE_NOT_NULL(
step_var, platform::errors::NotFound("Variable %s is not found in scope.",
step_name));
float count = *count_var->Get<LoDTensor>().data<float>(); float count = *count_var->Get<LoDTensor>().data<float>();
float step = *step_var->Get<LoDTensor>().data<float>(); float step = *step_var->Get<LoDTensor>().data<float>();
......
...@@ -74,7 +74,9 @@ class PullDenseWorker { ...@@ -74,7 +74,9 @@ class PullDenseWorker {
virtual void Initialize(const TrainerDesc& param); virtual void Initialize(const TrainerDesc& param);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
void AddStream(const cudaStream_t stream) { copy_streams_.push_back(stream); } void AddStream(const cudaStream_t stream) { copy_streams_.push_back(stream); }
#endif
#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
void AddPlace(const paddle::platform::Place place) { void AddPlace(const paddle::platform::Place place) {
places_.push_back(place); places_.push_back(place);
} }
...@@ -135,9 +137,9 @@ class PullDenseWorker { ...@@ -135,9 +137,9 @@ class PullDenseWorker {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
std::vector<cudaStream_t> copy_streams_; std::vector<cudaStream_t> copy_streams_;
#endif
std::vector<paddle::platform::Place> places_; std::vector<paddle::platform::Place> places_;
std::vector<Scope*> thread_scopes_; std::vector<Scope*> thread_scopes_;
#endif
}; };
// should incorporate different type of device // should incorporate different type of device
...@@ -161,6 +163,7 @@ class DeviceWorker { ...@@ -161,6 +163,7 @@ class DeviceWorker {
virtual void SetDataFeed(DataFeed* data_feed); virtual void SetDataFeed(DataFeed* data_feed);
virtual void SetWorkerNum(int num) {} virtual void SetWorkerNum(int num) {}
virtual void CacheProgram(const ProgramDesc& main_program) {} virtual void CacheProgram(const ProgramDesc& main_program) {}
virtual void GetXpuOpIndex() {}
virtual void SetNeedDumpField(bool need_dump_field) { virtual void SetNeedDumpField(bool need_dump_field) {
need_dump_field_ = need_dump_field; need_dump_field_ = need_dump_field;
} }
......
...@@ -97,6 +97,7 @@ message AsyncConfig { ...@@ -97,6 +97,7 @@ message AsyncConfig {
optional int32 thread_pool_size = 6 [ default = 1 ]; optional int32 thread_pool_size = 6 [ default = 1 ];
optional int32 send_wait_times = 7 [ default = 1 ]; optional int32 send_wait_times = 7 [ default = 1 ];
optional bool runtime_split_send_recv = 8 [ default = false ]; optional bool runtime_split_send_recv = 8 [ default = false ];
optional bool launch_barrier = 9 [ default = true ];
} }
message PipelineConfig { optional int32 micro_batch = 1 [ default = 1 ]; } message PipelineConfig { optional int32 micro_batch = 1 [ default = 1 ]; }
......
...@@ -745,7 +745,57 @@ void FleetWrapper::PushDenseVarsAsync( ...@@ -745,7 +745,57 @@ void FleetWrapper::PushDenseVarsAsync(
push_sparse_status->push_back(std::move(status)); push_sparse_status->push_back(std::move(status));
} }
} }
#endif
#ifdef PADDLE_WITH_XPU
void FleetWrapper::PushDenseVarsAsync(
const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names,
std::vector<::std::future<int32_t>>* push_sparse_status,
float scale_datanorm, int batch_size,
const paddle::platform::Place& place) {
#ifdef PADDLE_WITH_PSLIB
std::vector<paddle::ps::Region> regions;
for (auto& t : var_names) {
Variable* var = scope.FindVar(t);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
int count = tensor->numel();
float* g_data = tensor->data<float>();
Variable* pin_var = scope.FindVar(t + "pin");
LoDTensor* pin_tensor = pin_var->GetMutable<LoDTensor>();
float* pin_g =
pin_tensor->mutable_data<float>(tensor->dims(), platform::CPUPlace());
memory::Copy(platform::CPUPlace(), pin_g,
BOOST_GET_CONST(platform::XPUPlace, place), g_data,
sizeof(float) * count);
float* g = pin_g;
if (scale_datanorm >= 0) {
if (t.find(".batch_size@GRAD") != std::string::npos ||
t.find(".batch_sum@GRAD") != std::string::npos) {
Eigen::Map<Eigen::MatrixXf> mat(g, 1, count);
float scale = 1.0 / batch_size;
mat *= scale;
} else if (t.find(".batch_square_sum@GRAD") != std::string::npos) {
VLOG(3) << "epsilon: " << scale_datanorm;
for (int i = 0; i < count; ++i) {
g[i] = (g[i] - batch_size * scale_datanorm) / batch_size +
batch_size * scale_datanorm;
}
}
}
paddle::ps::Region reg(g, count);
regions.emplace_back(std::move(reg));
}
auto status = pslib_ptr_->_worker_ptr->push_dense(regions.data(),
regions.size(), table_id);
if (push_sparse_status) {
push_sparse_status->push_back(std::move(status));
}
#endif
}
#endif #endif
void FleetWrapper::PushDenseVarsAsync( void FleetWrapper::PushDenseVarsAsync(
const Scope& scope, const uint64_t table_id, const Scope& scope, const uint64_t table_id,
......
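Editor's note: the scale_datanorm rescaling inside the new XPU PushDenseVarsAsync is easier to read in isolation. A standalone model, with eps standing in for scale_datanorm: batch_size and batch_sum gradients are averaged, while batch_square_sum gradients are shifted by batch_size * eps around the division.

#include <cstdio>
#include <vector>

void RescaleDatanormGrad(std::vector<float>* g, int batch_size, float eps,
                         bool is_square_sum) {
  for (float& v : *g) {
    if (is_square_sum) {
      // g = (g - batch_size*eps) / batch_size + batch_size*eps
      v = (v - batch_size * eps) / batch_size + batch_size * eps;
    } else {
      v *= 1.0f / batch_size;  // plain averaging over the batch
    }
  }
}

int main() {
  std::vector<float> g = {8.0f, 16.0f};
  RescaleDatanormGrad(&g, 8, 1e-4f, /*is_square_sum=*/false);
  std::printf("%.3f %.3f\n", g[0], g[1]);  // prints 1.000 2.000
  return 0;
}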
...@@ -160,6 +160,14 @@ class FleetWrapper { ...@@ -160,6 +160,14 @@ class FleetWrapper {
float scale_datanorm, int batch_size, float scale_datanorm, int batch_size,
const paddle::platform::Place& place, cudaStream_t stream, const paddle::platform::Place& place, cudaStream_t stream,
cudaEvent_t event); cudaEvent_t event);
#endif
#ifdef PADDLE_WITH_XPU
void PushDenseVarsAsync(
const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names,
std::vector<::std::future<int32_t>>* push_sparse_status,
float scale_datanorm, int batch_size,
const paddle::platform::Place& place);
#endif #endif
void PushDenseVarsAsync( void PushDenseVarsAsync(
const Scope& scope, const uint64_t table_id, const Scope& scope, const uint64_t table_id,
......
...@@ -113,30 +113,66 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope, ...@@ -113,30 +113,66 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope,
if (platform::is_cpu_place(tensor->place())) { if (platform::is_cpu_place(tensor->place())) {
memcpy(data_ptr, tensor->data<void>(), memcpy(data_ptr, tensor->data<void>(),
tensor->numel() * SizeOfType(tensor->type())); tensor->numel() * SizeOfType(tensor->type()));
#ifdef PADDLE_WITH_CUDA
} else { } else {
#ifdef PADDLE_WITH_CUDA
memory::Copy(platform::CPUPlace(), data_ptr, memory::Copy(platform::CPUPlace(), data_ptr,
BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), BOOST_GET_CONST(platform::CUDAPlace, tensor->place()),
tensor->data<void>(), tensor->data<void>(),
tensor->numel() * SizeOfType(tensor->type()), nullptr); tensor->numel() * SizeOfType(tensor->type()), nullptr);
}
#else
}
#endif #endif
#ifdef PADDLE_WITH_XPU
memory::Copy(platform::CPUPlace(), data_ptr,
BOOST_GET_CONST(platform::XPUPlace, tensor->place()),
tensor->data<void>(),
tensor->numel() * SizeOfType(tensor->type()));
#endif
}
} }
// void HeterWrapper::DeSerializeToTensor(Scope* scope,
// const HeterRequest* request) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
void HeterWrapper::DeSerializeToTensor(Scope* scope, void HeterWrapper::DeSerializeToTensor(Scope* scope,
const VariableMessage& req_var, const VariableMessage& req_var,
platform::Place place, platform::Place place,
cudaStream_t stream) { cudaStream_t stream) {
// const VariableMessage& req_var = request->vars();
auto* var = scope->FindVar(req_var.varname());
auto* tensor = var->GetMutable<LoDTensor>();
std::vector<int> vec_dim;
for (auto& x : req_var.dims()) {
vec_dim.push_back(x);
}
tensor->Resize(make_ddim(vec_dim));
LoD lod;
for (int i = 0; i < req_var.lod_level(); ++i) {
framework::Vector<size_t> v;
for (int j = 0; j < req_var.lod(i).lod_data_size(); ++j) {
v.push_back(req_var.lod(i).lod_data(j));
}
lod.push_back(v);
}
tensor->set_lod(lod);
void* tensor_data =
tensor->mutable_data(place, ToVarType(req_var.data_type()));
#ifdef PADDLE_WITH_CUDA
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
platform::CPUPlace(), req_var.data().data(),
tensor->numel() * SizeOfType(tensor->type()), stream);
#else #else
memcpy(tensor_data, req_var.data().data(),
tensor->numel() * SizeOfType(tensor->type()));
#endif
}
#endif
// void HeterWrapper::DeSerializeToTensor(Scope* scope,
// const HeterRequest* request) {
void HeterWrapper::DeSerializeToTensor(Scope* scope, void HeterWrapper::DeSerializeToTensor(Scope* scope,
const VariableMessage& req_var, const VariableMessage& req_var,
platform::Place place) { platform::Place place) {
#endif
// const VariableMessage& req_var = request->vars(); // const VariableMessage& req_var = request->vars();
auto* var = scope->FindVar(req_var.varname()); auto* var = scope->FindVar(req_var.varname());
auto* tensor = var->GetMutable<LoDTensor>(); auto* tensor = var->GetMutable<LoDTensor>();
...@@ -160,10 +196,10 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope, ...@@ -160,10 +196,10 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope,
void* tensor_data = void* tensor_data =
tensor->mutable_data(place, ToVarType(req_var.data_type())); tensor->mutable_data(place, ToVarType(req_var.data_type()));
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_XPU
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data, memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place), tensor_data,
platform::CPUPlace(), req_var.data().data(), platform::CPUPlace(), req_var.data().data(),
tensor->numel() * SizeOfType(tensor->type()), stream); tensor->numel() * SizeOfType(tensor->type()));
#else #else
memcpy(tensor_data, req_var.data().data(), memcpy(tensor_data, req_var.data().data(),
tensor->numel() * SizeOfType(tensor->type())); tensor->numel() * SizeOfType(tensor->type()));
...@@ -184,7 +220,8 @@ framework::proto::VarType::Type HeterWrapper::ToVarType( ...@@ -184,7 +220,8 @@ framework::proto::VarType::Type HeterWrapper::ToVarType(
case VariableMessage::BOOL: case VariableMessage::BOOL:
return framework::proto::VarType::BOOL; // NOLINT return framework::proto::VarType::BOOL; // NOLINT
default: default:
VLOG(0) << "Not support type " << type; PADDLE_THROW(platform::errors::InvalidArgument(
"ToVarType:Unsupported type %d", type));
} }
} }
......
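Editor's note: the ToVarType change above matters because the old default case only logged and then fell off the end of a non-void function (undefined behavior). A minimal analogue of the fixed shape — every unhandled value now fails loudly (enum names here are illustrative, not Paddle's):

#include <stdexcept>

enum class WireType { FP32, INT64, BOOL, UNKNOWN };
enum class VarType { FP32, INT64, BOOL };

VarType ToVarType(WireType t) {
  switch (t) {
    case WireType::FP32:  return VarType::FP32;
    case WireType::INT64: return VarType::INT64;
    case WireType::BOOL:  return VarType::BOOL;
    default:
      // Was: log and return nothing; now: throw with the offending type.
      throw std::invalid_argument("ToVarType: unsupported wire type");
  }
}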
...@@ -12,9 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB) #include <cstdlib>
#include <ctime>
#include <string>
#include <vector>
#include "io/fs.h"
#include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/data_set.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
#include "paddle/fluid/framework/trainer.h"
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
(defined PADDLE_WITH_PSLIB)
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/cuda_device_guard.h"
#endif
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -34,6 +46,7 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc, ...@@ -34,6 +46,7 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
int place_num = trainer_desc.worker_places_size(); int place_num = trainer_desc.worker_places_size();
for (int i = 0; i < place_num; ++i) { for (int i = 0; i < place_num; ++i) {
int num = trainer_desc.worker_places(i); int num = trainer_desc.worker_places(i);
#ifdef PADDLE_WITH_CUDA
platform::CUDAPlace place = platform::CUDAPlace(num); platform::CUDAPlace place = platform::CUDAPlace(num);
platform::CUDADeviceGuard guard(place.device); platform::CUDADeviceGuard guard(place.device);
cudaStream_t stream; cudaStream_t stream;
...@@ -44,6 +57,11 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc, ...@@ -44,6 +57,11 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
PADDLE_ENFORCE_CUDA_SUCCESS( PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
events_.push_back(event); events_.push_back(event);
#endif
#ifdef PADDLE_WITH_XPU
platform::XPUPlace place = platform::XPUPlace(num);
places_.push_back(place);
#endif
} }
// thread_num_ = trainer_desc.thread_num(); // thread_num_ = trainer_desc.thread_num();
// SetDataset(dataset); // SetDataset(dataset);
...@@ -95,11 +113,17 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc, ...@@ -95,11 +113,17 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) { void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) {
auto place = places_[num]; auto place = places_[num];
Scope* scope = place_scopes_[num]; Scope* scope = place_scopes_[num];
#ifdef PADDLE_WITH_CUDA
auto stream = copy_streams_[num]; auto stream = copy_streams_[num];
auto event = events_[num]; auto event = events_[num];
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
platform::CUDADeviceGuard guard(dev_id); platform::CUDADeviceGuard guard(dev_id);
#endif
#ifdef PADDLE_WITH_XPU
xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device);
#endif
auto& block = program.Block(0); auto& block = program.Block(0);
for (auto& var : block.AllVars()) { for (auto& var : block.AllVars()) {
if (var->Persistable()) { if (var->Persistable()) {
...@@ -116,13 +140,28 @@ void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) { ...@@ -116,13 +140,28 @@ void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) {
HeterMemCpy<cpp_type>(thread_tensor, root_tensor, place, stream); \ HeterMemCpy<cpp_type>(thread_tensor, root_tensor, place, stream); \
} \ } \
} while (0) } while (0)
#define HeterMemcpyXpuFunc(cpp_type, proto_type) \
do { \
if (root_tensor->type() == proto_type) { \
HeterMemCpy<cpp_type>(thread_tensor, root_tensor, place); \
} \
} while (0)
#ifdef PADDLE_WITH_CUDA
_ForEachDataType_(HeterMemcpyFunc); _ForEachDataType_(HeterMemcpyFunc);
#endif
#ifdef PADDLE_WITH_XPU
_ForEachDataType_(HeterMemcpyXpuFunc);
#endif
} }
} }
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream));
cudaEventSynchronize(event); cudaEventSynchronize(event);
#endif
} }
#ifdef PADDLE_WITH_CUDA
template <typename T> template <typename T>
void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor, void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
LoDTensor* root_tensor, LoDTensor* root_tensor,
...@@ -141,6 +180,27 @@ void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor, ...@@ -141,6 +180,27 @@ void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
root_ptr, sizeof(T) * root_tensor->numel(), stream); root_ptr, sizeof(T) * root_tensor->numel(), stream);
} }
} }
#endif
#ifdef PADDLE_WITH_XPU
template <typename T>
void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
LoDTensor* root_tensor,
const paddle::platform::Place& thread_place) {
T* thread_ptr =
thread_tensor->mutable_data<T>(root_tensor->dims(), thread_place);
T* root_ptr = root_tensor->data<T>();
if (platform::is_cpu_place(root_tensor->place())) {
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr,
platform::CPUPlace(), root_ptr,
sizeof(T) * root_tensor->numel());
} else {
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr,
BOOST_GET_CONST(platform::XPUPlace, root_tensor->place()),
root_ptr, sizeof(T) * root_tensor->numel());
}
}
#endif
void HeterXpuTrainer::DumpWork(int tid) {} void HeterXpuTrainer::DumpWork(int tid) {}
...@@ -171,13 +231,16 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) { ...@@ -171,13 +231,16 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) {
CreateThreadParam(main_program, i); CreateThreadParam(main_program, i);
pull_dense_worker_->AddThreadScope(scope); pull_dense_worker_->AddThreadScope(scope);
pull_dense_worker_->AddPlace(places_[i]); pull_dense_worker_->AddPlace(places_[i]);
#ifdef PADDLE_WITH_CUDA
pull_dense_worker_->AddStream(copy_streams_[i]); pull_dense_worker_->AddStream(copy_streams_[i]);
#endif
} }
pull_dense_worker_->Start(); pull_dense_worker_->Start();
#ifdef PADDLE_WITH_CUDA
for (auto& stream : copy_streams_) { for (auto& stream : copy_streams_) {
cudaStreamSynchronize(stream); cudaStreamSynchronize(stream);
} }
#endif
op_names_.clear(); op_names_.clear();
for (auto& op_desc : block.AllOps()) { for (auto& op_desc : block.AllOps()) {
std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc); std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
...@@ -220,10 +283,12 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) { ...@@ -220,10 +283,12 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) {
OperatorBase* local_op_ptr = local_op.release(); OperatorBase* local_op_ptr = local_op.release();
(context->ops_).push_back(local_op_ptr); (context->ops_).push_back(local_op_ptr);
} }
#ifdef PADDLE_WITH_CUDA
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
platform::CUDADeviceGuard guard(dev_id); platform::CUDADeviceGuard guard(dev_id);
PADDLE_ENFORCE_CUDA_SUCCESS( PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming)); cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming));
#endif
object_pool_.Push(context); object_pool_.Push(context);
} }
} }
...@@ -267,12 +332,25 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request, ...@@ -267,12 +332,25 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request,
} \ } \
} while (0) } while (0)
_ForEachDataType_(MergeCallback); _ForEachDataType_(MergeCallback);
if (platform::is_gpu_place(thread_tensor->place())) { if (!platform::is_cpu_place(thread_tensor->place())) {
#ifdef PADDLE_WITH_CUDA
auto dev_id = auto dev_id =
BOOST_GET_CONST(platform::CUDAPlace, thread_tensor->place()).device; BOOST_GET_CONST(platform::CUDAPlace, thread_tensor->place()).device;
platform::CUDADeviceGuard guard(dev_id); platform::CUDADeviceGuard guard(dev_id);
cudaMemset(thread_tensor->data<void>(), 0, cudaMemset(thread_tensor->data<void>(), 0,
thread_tensor->numel() * SizeOfType(thread_tensor->type())); thread_tensor->numel() * SizeOfType(thread_tensor->type()));
#endif
#ifdef PADDLE_WITH_XPU
auto place = thread_tensor->place();
xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device);
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
platform::DeviceContext* dev_ctx = pool.Get(place);
const platform::XPUDeviceContext* xpu_ctx =
reinterpret_cast<const platform::XPUDeviceContext*>(dev_ctx);
xpu::memset(xpu_ctx->x_context(), thread_tensor->data<void>(), 0,
thread_tensor->numel() * SizeOfType(thread_tensor->type()));
#endif
} else { } else {
memset(thread_tensor->data<void>(), 0, memset(thread_tensor->data<void>(), 0,
thread_tensor->numel() * SizeOfType(thread_tensor->type())); thread_tensor->numel() * SizeOfType(thread_tensor->type()));
...@@ -281,12 +359,25 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request, ...@@ -281,12 +359,25 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request,
auto* merge_var = response->add_vars(); auto* merge_var = response->add_vars();
heter_ptr_->SerializeToReq(need_merge_var_names_[i], root_scope_, heter_ptr_->SerializeToReq(need_merge_var_names_[i], root_scope_,
merge_var); merge_var);
if (platform::is_gpu_place(root_tensor->place())) { if (!platform::is_cpu_place(root_tensor->place())) {
#ifdef PADDLE_WITH_CUDA
auto dev_id = auto dev_id =
BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()).device; BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()).device;
platform::CUDADeviceGuard guard(dev_id); platform::CUDADeviceGuard guard(dev_id);
cudaMemset(root_tensor->data<void>(), 0, cudaMemset(root_tensor->data<void>(), 0,
root_tensor->numel() * SizeOfType(root_tensor->type())); root_tensor->numel() * SizeOfType(root_tensor->type()));
#endif
#ifdef PADDLE_WITH_XPU
auto place = root_tensor->place();
xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device);
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
platform::DeviceContext* dev_ctx = pool.Get(place);
const platform::XPUDeviceContext* xpu_ctx =
reinterpret_cast<const platform::XPUDeviceContext*>(dev_ctx);
xpu::memset(xpu_ctx->x_context(), root_tensor->data<void>(), 0,
root_tensor->numel() * SizeOfType(root_tensor->type()));
#endif
} else { } else {
memset(root_tensor->data<void>(), 0, memset(root_tensor->data<void>(), 0,
root_tensor->numel() * SizeOfType(root_tensor->type())); root_tensor->numel() * SizeOfType(root_tensor->type()));
...@@ -346,11 +437,12 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, ...@@ -346,11 +437,12 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
OperatorBase* local_op_ptr = local_op.release(); OperatorBase* local_op_ptr = local_op.release();
(context->ops_).push_back(local_op_ptr); (context->ops_).push_back(local_op_ptr);
} }
#ifdef PADDLE_WITH_CUDA
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
platform::CUDADeviceGuard guard(dev_id); platform::CUDADeviceGuard guard(dev_id);
PADDLE_ENFORCE_CUDA_SUCCESS( PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming)); cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming));
#endif
} }
context->Reset(); context->Reset();
...@@ -359,15 +451,22 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, ...@@ -359,15 +451,22 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
auto deserial_timer = auto deserial_timer =
std::make_shared<paddle::ps::CostTimer>("xpu_service_deserial"); std::make_shared<paddle::ps::CostTimer>("xpu_service_deserial");
for (int i = 0; i < request->vars_size(); ++i) { for (int i = 0; i < request->vars_size(); ++i) {
#ifdef PADDLE_WITH_CUDA
heter_ptr_->DeSerializeToTensor(context->scope_, request->vars(i), place, heter_ptr_->DeSerializeToTensor(context->scope_, request->vars(i), place,
copy_streams_[context->place_num_]); copy_streams_[context->place_num_]);
#endif
#ifdef PADDLE_WITH_XPU
heter_ptr_->DeSerializeToTensor(context->scope_, request->vars(i), place);
#endif
} }
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_CUDA_SUCCESS( PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventRecord(context->event_, copy_streams_[context->place_num_])); cudaEventRecord(context->event_, copy_streams_[context->place_num_]));
while (cudaEventQuery(context->event_) != cudaSuccess) { while (cudaEventQuery(context->event_) != cudaSuccess) {
VLOG(3) << "wait for kernel"; VLOG(3) << "wait for kernel";
bthread_yield(); bthread_yield();
} }
#endif
} }
{ {
...@@ -378,6 +477,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, ...@@ -378,6 +477,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
op->Run(*(context->scope_), place); op->Run(*(context->scope_), place);
} }
} }
#ifdef PADDLE_WITH_CUDA
auto* dev_ctx = static_cast<platform::CUDADeviceContext*>( auto* dev_ctx = static_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place)); platform::DeviceContextPool::Instance().Get(place));
PADDLE_ENFORCE_CUDA_SUCCESS( PADDLE_ENFORCE_CUDA_SUCCESS(
...@@ -391,6 +491,10 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, ...@@ -391,6 +491,10 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
bthread_yield(); bthread_yield();
} }
} }
#endif
#ifdef PADDLE_WITH_XPU
xpu_wait();
#endif
for (int i = 0; i < trainer_desc_.xpu_send_list_size(); ++i) { for (int i = 0; i < trainer_desc_.xpu_send_list_size(); ++i) {
const std::string& varname = trainer_desc_.xpu_send_list(i); const std::string& varname = trainer_desc_.xpu_send_list(i);
...@@ -407,11 +511,19 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, ...@@ -407,11 +511,19 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
++i) { ++i) {
uint64_t tid = uint64_t tid =
static_cast<uint64_t>(param_.program_config(0).push_dense_table_id(i)); static_cast<uint64_t>(param_.program_config(0).push_dense_table_id(i));
#ifdef PADDLE_WITH_CUDA
fleet_ptr_->PushDenseVarsAsync( fleet_ptr_->PushDenseVarsAsync(
*(context->scope_), tid, dense_grad_names_[tid], *(context->scope_), tid, dense_grad_names_[tid],
&(context->push_dense_status_), scale_datanorm_, request->cur_batch(), &(context->push_dense_status_), scale_datanorm_, request->cur_batch(),
places_[context->place_num_], copy_streams_[context->place_num_], places_[context->place_num_], copy_streams_[context->place_num_],
context->event_); context->event_);
#endif
#ifdef PADDLE_WITH_XPU
fleet_ptr_->PushDenseVarsAsync(
*(context->scope_), tid, dense_grad_names_[tid],
&(context->push_dense_status_), scale_datanorm_, request->cur_batch(),
places_[context->place_num_]);
#endif
} }
for (int i = 0; i < param_.program_config(0).push_dense_table_id_size(); for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
++i) { ++i) {
...@@ -453,7 +565,6 @@ void HeterXpuTrainer::Finalize() { ...@@ -453,7 +565,6 @@ void HeterXpuTrainer::Finalize() {
pull_dense_worker_->Stop(); pull_dense_worker_->Stop();
root_scope_->DropKids(); root_scope_->DropKids();
} }
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
#endif #endif
...@@ -1894,8 +1894,7 @@ PDNode *patterns::QuantizePlacement::operator()( ...@@ -1894,8 +1894,7 @@ PDNode *patterns::QuantizePlacement::operator()(
PDNode *patterns::Bfloat16Placement::operator()( PDNode *patterns::Bfloat16Placement::operator()(
const std::unordered_set<std::string> &bfloat16_enabled_op_types) { const std::unordered_set<std::string> &bfloat16_enabled_op_types) {
std::unordered_set<std::string> supported_op_types = std::unordered_set<std::string> supported_op_types{"conv2d"};
std::unordered_set<std::string>();
if (!bfloat16_enabled_op_types.empty()) { if (!bfloat16_enabled_op_types.empty()) {
supported_op_types = bfloat16_enabled_op_types; supported_op_types = bfloat16_enabled_op_types;
} }
......
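Editor's note: the Bfloat16Placement hunk replaces an empty default set with a non-empty one, so an empty argument no longer disables placement. A sketch of that default-with-override pattern:

#include <string>
#include <unordered_set>

std::unordered_set<std::string> ResolveOpTypes(
    const std::unordered_set<std::string>& requested) {
  // Start from the built-in default; only an explicit, non-empty request
  // replaces it.
  std::unordered_set<std::string> supported{"conv2d"};
  if (!requested.empty()) supported = requested;
  return supported;
}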
...@@ -62,13 +62,15 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) { ...@@ -62,13 +62,15 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) {
fleet_ptr_ = FleetWrapper::GetInstance(); fleet_ptr_ = FleetWrapper::GetInstance();
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
copy_streams_.clear(); copy_streams_.clear();
#endif
#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
places_.clear(); places_.clear();
thread_scopes_.clear(); thread_scopes_.clear();
#endif #endif
} }
void PullDenseWorker::CreatePinVar() { void PullDenseWorker::CreatePinVar() {
#ifdef PADDLE_WITH_CUDA #if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_PSLIB)
// for (auto& v : dense_value_names_) { // for (auto& v : dense_value_names_) {
// for (auto& name : v.second) { // for (auto& name : v.second) {
for (int i = 0; i < dwp_param_.program_config(0).pull_dense_table_id_size(); for (int i = 0; i < dwp_param_.program_config(0).pull_dense_table_id_size();
...@@ -83,8 +85,13 @@ void PullDenseWorker::CreatePinVar() { ...@@ -83,8 +85,13 @@ void PullDenseWorker::CreatePinVar() {
auto* ptr = root_scope_->Var(name + "pin"); auto* ptr = root_scope_->Var(name + "pin");
InitializeVariable(ptr, proto::VarType::LOD_TENSOR); InitializeVariable(ptr, proto::VarType::LOD_TENSOR);
LoDTensor* pin_tensor = ptr->GetMutable<LoDTensor>(); LoDTensor* pin_tensor = ptr->GetMutable<LoDTensor>();
#ifdef PADDLE_WITH_CUDA
pin_tensor->mutable_data<float>(tensor->dims(), pin_tensor->mutable_data<float>(tensor->dims(),
platform::CUDAPinnedPlace()); platform::CUDAPinnedPlace());
#endif
#ifdef PADDLE_WITH_XPU
pin_tensor->mutable_data<float>(tensor->dims(), platform::CPUPlace());
#endif
} }
} }
#endif #endif
...@@ -107,7 +114,7 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) { ...@@ -107,7 +114,7 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
exit(-1); exit(-1);
} }
status_vec->resize(0); status_vec->resize(0);
#ifdef PADDLE_WITH_CUDA #if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
for (size_t i = 0; i < places_.size(); ++i) { for (size_t i = 0; i < places_.size(); ++i) {
// for (auto& v : dense_value_names_) { // for (auto& v : dense_value_names_) {
...@@ -125,9 +132,16 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) { ...@@ -125,9 +132,16 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
Variable* var = thread_scopes_[i]->FindVar(name); Variable* var = thread_scopes_[i]->FindVar(name);
LoDTensor* tensor = var->GetMutable<LoDTensor>(); LoDTensor* tensor = var->GetMutable<LoDTensor>();
float* w = tensor->data<float>(); float* w = tensor->data<float>();
#ifdef PADDLE_WITH_CUDA
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, places_[i]), w, memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, places_[i]), w,
platform::CUDAPinnedPlace(), pin_w, platform::CUDAPinnedPlace(), pin_w,
sizeof(float) * tensor->numel(), copy_streams_[i]); sizeof(float) * tensor->numel(), copy_streams_[i]);
#endif
#ifdef PADDLE_WITH_XPU
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, places_[i]), w,
platform::CPUPlace(), pin_w,
sizeof(float) * tensor->numel());
#endif
} }
} }
} }
...@@ -148,7 +162,7 @@ void PullDenseWorker::PullDense(bool force_update) { ...@@ -148,7 +162,7 @@ void PullDenseWorker::PullDense(bool force_update) {
uint64_t tid = static_cast<uint64_t>( uint64_t tid = static_cast<uint64_t>(
dwp_param_.program_config(0).pull_dense_table_id(i)); dwp_param_.program_config(0).pull_dense_table_id(i));
if (force_update || CheckUpdateParam(tid)) { if (force_update || CheckUpdateParam(tid)) {
#ifdef PADDLE_WITH_CUDA #if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
VLOG(3) << "pull dense " << force_update << " " << tid; VLOG(3) << "pull dense " << force_update << " " << tid;
fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid], fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid],
&pull_dense_status_, false); &pull_dense_status_, false);
......
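Editor's note: a model of the staging path PullDenseWorker takes after this change — pulled parameters land in a host "pin" buffer (CUDAPinnedPlace on GPU builds, plain CPUPlace on XPU builds), then one copy per device scope moves them onto the device. DeviceCopy below is a stand-in for memory::Copy, which the real code dispatches under PADDLE_WITH_CUDA / PADDLE_WITH_XPU.

#include <cstddef>
#include <cstring>
#include <vector>

static void DeviceCopy(float* dst, const float* src, std::size_t n) {
  std::memcpy(dst, src, n * sizeof(float));  // host copy in this sketch
}

void BroadcastPinned(const std::vector<float>& pin,
                     std::vector<std::vector<float>>* per_device) {
  for (auto& dev : *per_device) {
    dev.resize(pin.size());
    DeviceCopy(dev.data(), pin.data(), pin.size());
  }
}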
...@@ -138,7 +138,8 @@ class DistMultiTrainer : public MultiTrainer { ...@@ -138,7 +138,8 @@ class DistMultiTrainer : public MultiTrainer {
std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_; std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
}; };
#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB) #if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
(defined PADDLE_WITH_PSLIB)
class HeterServiceContext { class HeterServiceContext {
public: public:
HeterServiceContext() {} HeterServiceContext() {}
...@@ -151,7 +152,9 @@ class HeterServiceContext { ...@@ -151,7 +152,9 @@ class HeterServiceContext {
void Reset() { push_dense_status_.clear(); } void Reset() { push_dense_status_.clear(); }
int place_num_; int place_num_;
Scope* scope_{nullptr}; Scope* scope_{nullptr};
#ifdef PADDLE_WITH_CUDA
cudaEvent_t event_; cudaEvent_t event_;
#endif
std::vector<OperatorBase*> ops_; std::vector<OperatorBase*> ops_;
std::vector<::std::future<int32_t>> push_dense_status_; std::vector<::std::future<int32_t>> push_dense_status_;
}; };
...@@ -178,10 +181,18 @@ class HeterXpuTrainer : public TrainerBase { ...@@ -178,10 +181,18 @@ class HeterXpuTrainer : public TrainerBase {
virtual void CacheProgram(const ProgramDesc& main_program) { virtual void CacheProgram(const ProgramDesc& main_program) {
new (&program_) ProgramDesc(main_program); new (&program_) ProgramDesc(main_program);
} }
virtual std::string GetDumpPath(int tid) { return ""; }
virtual void InitDumpEnv() {}
template <typename T> template <typename T>
#ifdef PADDLE_WITH_CUDA
void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor, void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
const paddle::platform::Place& thread_place, const paddle::platform::Place& thread_place,
cudaStream_t stream); cudaStream_t stream);
#endif
#ifdef PADDLE_WITH_XPU
void HeterMemCpy(LoDTensor* thread_tensor, LoDTensor* root_tensor,
const paddle::platform::Place& thread_place);
#endif
void CreateThreadParam(const ProgramDesc& program, int num); void CreateThreadParam(const ProgramDesc& program, int num);
template <typename T> template <typename T>
void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor); void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
...@@ -207,9 +218,11 @@ class HeterXpuTrainer : public TrainerBase { ...@@ -207,9 +218,11 @@ class HeterXpuTrainer : public TrainerBase {
std::vector<std::string> op_names_; std::vector<std::string> op_names_;
std::vector<Scope*> place_scopes_; std::vector<Scope*> place_scopes_;
BtObjectPool<HeterServiceContext> object_pool_; BtObjectPool<HeterServiceContext> object_pool_;
std::vector<cudaStream_t> copy_streams_;
std::vector<platform::Place> places_; std::vector<platform::Place> places_;
#ifdef PADDLE_WITH_CUDA
std::vector<cudaStream_t> copy_streams_;
std::vector<cudaEvent_t> events_; std::vector<cudaEvent_t> events_;
#endif
}; };
#endif #endif
......
...@@ -63,7 +63,8 @@ std::shared_ptr<TrainerBase> TrainerFactory::CreateTrainer( ...@@ -63,7 +63,8 @@ std::shared_ptr<TrainerBase> TrainerFactory::CreateTrainer(
REGISTER_TRAINER_CLASS(MultiTrainer); REGISTER_TRAINER_CLASS(MultiTrainer);
REGISTER_TRAINER_CLASS(DistMultiTrainer); REGISTER_TRAINER_CLASS(DistMultiTrainer);
#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB) #if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
(defined PADDLE_WITH_PSLIB)
REGISTER_TRAINER_CLASS(HeterXpuTrainer); REGISTER_TRAINER_CLASS(HeterXpuTrainer);
#endif #endif
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL)
......
...@@ -13,9 +13,11 @@ ...@@ -13,9 +13,11 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/imperative/gradient_accumulator.h" #include "paddle/fluid/imperative/gradient_accumulator.h"
#include <algorithm> #include <algorithm>
#include <memory> #include <memory>
#include <utility> #include <utility>
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
...@@ -136,9 +138,13 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) { ...@@ -136,9 +138,13 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
return; return;
} }
PADDLE_ENFORCE_EQ(dst_tensor->numel() == numel, true, PADDLE_ENFORCE_EQ(
"dst_numel %d vs. src_numel %d", dst_tensor->numel(), dst_tensor->numel(), numel,
numel); platform::errors::PreconditionNotMet(
"The number of elements of source tensor and destination tensor "
"should be equal, but got the number of elements of source tensor is "
"%zu and the number of elements of destination tensor is %zu.",
numel, dst_tensor->numel()));
auto data_type = src_tensor.type(); auto data_type = src_tensor.type();
auto place = src_tensor.place(); auto place = src_tensor.place();
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/imperative/jit/program_desc_tracer.h" #include "paddle/fluid/imperative/jit/program_desc_tracer.h"
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
...@@ -203,7 +204,8 @@ TracedProgramTuple ProgramDescTracer::CreateProgramDesc( ...@@ -203,7 +204,8 @@ TracedProgramTuple ProgramDescTracer::CreateProgramDesc(
void ProgramDescTracer::InsertVarIfNotExist( void ProgramDescTracer::InsertVarIfNotExist(
const std::shared_ptr<VarBase> &new_var, bool is_input) { const std::shared_ptr<VarBase> &new_var, bool is_input) {
PADDLE_ENFORCE_NOT_NULL(new_var); PADDLE_ENFORCE_NOT_NULL(new_var, platform::errors::InvalidArgument(
"The variable to insert is NULL."));
if (vars_.count(new_var) != 0) return; if (vars_.count(new_var) != 0) return;
auto new_var_desc = new framework::VarDesc(""); auto new_var_desc = new framework::VarDesc("");
...@@ -220,7 +222,9 @@ void ProgramDescTracer::InsertVarIfNotExist( ...@@ -220,7 +222,9 @@ void ProgramDescTracer::InsertVarIfNotExist(
} }
const auto &inner_var = new_var->Var(); const auto &inner_var = new_var->Var();
PADDLE_ENFORCE_EQ(inner_var.IsInitialized(), true); PADDLE_ENFORCE_EQ(inner_var.IsInitialized(), true,
platform::errors::InvalidArgument(
"The variable to insert is not initialized."));
if (inner_var.IsType<framework::LoDTensor>()) { if (inner_var.IsType<framework::LoDTensor>()) {
const auto &tensor = inner_var.Get<framework::LoDTensor>(); const auto &tensor = inner_var.Get<framework::LoDTensor>();
new_var_desc->SetType(framework::proto::VarType::LOD_TENSOR); new_var_desc->SetType(framework::proto::VarType::LOD_TENSOR);
...@@ -232,8 +236,9 @@ void ProgramDescTracer::InsertVarIfNotExist( ...@@ -232,8 +236,9 @@ void ProgramDescTracer::InsertVarIfNotExist(
new_var_desc->SetDataType(framework::proto::VarType::FP32); new_var_desc->SetDataType(framework::proto::VarType::FP32);
} }
} else { } else {
PADDLE_THROW("Not support variable type %s", PADDLE_THROW(platform::errors::InvalidArgument(
framework::ToTypeName(inner_var.Type())); "Unsupported variable type %s.",
framework::ToTypeName(inner_var.Type())));
} }
} }
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/imperative/nccl_context.h" #include "paddle/fluid/imperative/nccl_context.h"
#include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/collective_helper.h"
namespace paddle { namespace paddle {
...@@ -21,8 +22,10 @@ namespace imperative { ...@@ -21,8 +22,10 @@ namespace imperative {
void NCCLParallelContext::RecvNCCLID(const std::string &ep, void NCCLParallelContext::RecvNCCLID(const std::string &ep,
ncclUniqueId *nccl_id) { ncclUniqueId *nccl_id) {
auto addr = paddle::string::Split(ep, ':'); auto addr = paddle::string::Split(ep, ':');
PADDLE_ENFORCE_EQ(addr.size(), 2UL, PADDLE_ENFORCE_EQ(
"The endpoint should contain host and port: %s", ep); addr.size(), 2UL,
platform::errors::InvalidArgument(
"The endpoint should contain host and port, but got %s.", ep));
std::string host = addr[0]; std::string host = addr[0];
int port = std::stoi(addr[1]); int port = std::stoi(addr[1]);
...@@ -32,27 +35,41 @@ void NCCLParallelContext::RecvNCCLID(const std::string &ep, ...@@ -32,27 +35,41 @@ void NCCLParallelContext::RecvNCCLID(const std::string &ep,
char buffer[1024] = {0}; char buffer[1024] = {0};
int opt = 0; int opt = 0;
// creating socket fd // creating socket fd
if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) == 0) if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) == 0) {
PADDLE_THROW("create server fd failed"); PADDLE_THROW(
if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt))) platform::errors::Unavailable("Create server file descriptor failed."));
PADDLE_THROW("set socket opt failed"); }
if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt))) {
PADDLE_THROW(platform::errors::Unavailable("Set socket options failed."));
}
address.sin_family = AF_INET; address.sin_family = AF_INET;
address.sin_addr.s_addr = INADDR_ANY; address.sin_addr.s_addr = INADDR_ANY;
address.sin_port = htons(port); address.sin_port = htons(port);
if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0) if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0) {
PADDLE_THROW("binding failed on ep: %s", ep); PADDLE_THROW(
platform::errors::Unavailable("Bind on endpoint %s failed.", ep));
}
VLOG(3) << "listening on: " << ep; VLOG(3) << "listening on: " << ep;
if (listen(server_fd, 3) < 0) PADDLE_THROW("listen on server fd failed"); if (listen(server_fd, 3) < 0) {
PADDLE_THROW(platform::errors::Unavailable(
"Listen on server file descriptor failed."));
}
if ((new_socket = if ((new_socket =
accept(server_fd, reinterpret_cast<struct sockaddr *>(&address), accept(server_fd, reinterpret_cast<struct sockaddr *>(&address),
reinterpret_cast<socklen_t *>(&addrlen))) < 0) reinterpret_cast<socklen_t *>(&addrlen))) < 0) {
PADDLE_THROW("accept the new socket fd failed"); PADDLE_THROW(platform::errors::Unavailable(
"Accept the new socket file descriptor failed."));
}
if (read(new_socket, buffer, 1024) < 0) if (read(new_socket, buffer, 1024) < 0) {
PADDLE_THROW("reading the ncclUniqueId from socket failed"); PADDLE_THROW(platform::errors::Unavailable("Read from socket failed."));
}
VLOG(3) << "recevived the ncclUniqueId"; VLOG(3) << "recevived the ncclUniqueId";
memcpy(nccl_id, buffer, NCCL_UNIQUE_ID_BYTES); memcpy(nccl_id, buffer, NCCL_UNIQUE_ID_BYTES);
...@@ -63,8 +80,10 @@ void NCCLParallelContext::RecvNCCLID(const std::string &ep, ...@@ -63,8 +80,10 @@ void NCCLParallelContext::RecvNCCLID(const std::string &ep,
void NCCLParallelContext::SendNCCLID(const std::string &ep, void NCCLParallelContext::SendNCCLID(const std::string &ep,
ncclUniqueId *nccl_id) { ncclUniqueId *nccl_id) {
auto addr = paddle::string::Split(ep, ':'); auto addr = paddle::string::Split(ep, ':');
PADDLE_ENFORCE_EQ(addr.size(), 2UL, PADDLE_ENFORCE_EQ(
"The endpoint should contain host and port: %s", ep); addr.size(), 2UL,
platform::errors::InvalidArgument(
"The endpoint should contain host and port, but got %s.", ep));
std::string host = addr[0]; std::string host = addr[0];
int port = std::stoi(addr[1]); int port = std::stoi(addr[1]);
// struct sockaddr_in address; // struct sockaddr_in address;
...@@ -73,15 +92,17 @@ void NCCLParallelContext::SendNCCLID(const std::string &ep, ...@@ -73,15 +92,17 @@ void NCCLParallelContext::SendNCCLID(const std::string &ep,
char buffer[1024] = {0}; char buffer[1024] = {0};
memcpy(buffer, nccl_id, NCCL_UNIQUE_ID_BYTES); memcpy(buffer, nccl_id, NCCL_UNIQUE_ID_BYTES);
if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
PADDLE_THROW("create socket failed"); PADDLE_THROW(platform::errors::Unavailable("Create socket failed."));
}
memset(&serv_addr, '0', sizeof(serv_addr)); memset(&serv_addr, '0', sizeof(serv_addr));
serv_addr.sin_family = AF_INET; serv_addr.sin_family = AF_INET;
serv_addr.sin_port = htons(port); serv_addr.sin_port = htons(port);
if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0) if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0) {
PADDLE_THROW("invalied address: %s", ep); PADDLE_THROW(platform::errors::Unavailable("Open address %s failed.", ep));
}
int try_times = 0; int try_times = 0;
while (true) { while (true) {
......
...@@ -127,11 +127,10 @@ void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) { ...@@ -127,11 +127,10 @@ void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
"Baidu Kunlun Card is properly installed.", "Baidu Kunlun Card is properly installed.",
ret)); ret));
ret = xpu_malloc(reinterpret_cast<void **>(&p), size); ret = xpu_malloc(reinterpret_cast<void **>(&p), size);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, PADDLE_ENFORCE_EQ(
platform::errors::External( ret, XPU_SUCCESS,
"XPU API return wrong value[%d], please check whether " platform::errors::External(
"Baidu Kunlun Card is properly installed.", "XPU API return wrong value[%d], no enough memory", ret));
ret));
if (FLAGS_init_allocated_mem) { if (FLAGS_init_allocated_mem) {
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
"xpu memory FLAGS_init_allocated_mem is not implemented.")); "xpu memory FLAGS_init_allocated_mem is not implemented."));
......
...@@ -763,10 +763,28 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel { ...@@ -763,10 +763,28 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel {
} }
}; };
// // AbsGrad: dx=dy if x >=0 else -dy
// AbsDoubleGrad: ddy = ddx if x >=0 else -ddx
template <typename T>
class AbsDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
public:
using ::paddle::framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("abs_grad_grad");
// input1: x
op->SetInput("X", this->Input("X"));
// input2: ddx
op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
op->SetAttrMap(this->Attrs());
// output: ddy
op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out")));
}
};
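A quick standalone check of the rule stated in the comment above (AbsDoubleGrad: ddy = ddx if x >= 0 else -ddx), matching the ddout = ddx * x.sign() computed by AbsGradGradFunctor further below; this sketch is illustrative only and does not use Paddle:

#include <cassert>

// Reference semantics of abs_grad_grad: ddout = sign(x) * ddx,
// where sign(0) == 0 (Eigen's sign()).
static double abs_double_grad(double x, double ddx) {
  double sign = (x > 0.0) - (x < 0.0);
  return sign * ddx;
}

int main() {
  assert(abs_double_grad(2.0, 0.5) == 0.5);    // x >= 0: ddy = ddx
  assert(abs_double_grad(-2.0, 0.5) == -0.5);  // x < 0:  ddy = -ddx
  assert(abs_double_grad(0.0, 0.5) == 0.0);    // sign(0) == 0
  return 0;
}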
// ReluGrad: dx = dy if y >= 0 else 0 // ReluGrad: dx = dy if y >= 0 else 0
// ReluGradGrad: ddy = ddx if y >= 0 else 0 // ReluGradGrad: ddy = ddx if y >= 0 else 0
//
template <typename T> template <typename T>
class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> { class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
public: public:
...@@ -873,6 +891,28 @@ class SquareDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> { ...@@ -873,6 +891,28 @@ class SquareDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
} }
}; };
// log Grad: dx = dout / x
// log Grad Grad: ddout = ddx / x; dx = -(dout / x) * (ddx / x)
template <typename T>
class LogDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
public:
using ::paddle::framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("log_grad_grad");
op->SetInput("X", this->Input("X"));
// X@GRAD@GRAD: ddx
op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
op->SetInput("DOut", this->Input(framework::GradVarName("Out")));
op->SetAttrMap(this->Attrs());
// X@GRAD: dx
op->SetOutput("DX", this->InputGrad("X"));
// Out@GRAD@GRAD: ddy
op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out")));
}
};
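For reference, the second-order formulas in the comment above follow from differentiating the first-order gradient dx = dout / x once more; a short derivation, with y = log(x):

dx = \frac{dout}{x}, \qquad
ddout = \frac{\partial\,dx}{\partial\,dout}\,ddx = \frac{ddx}{x}, \qquad
dx_{\text{new}} = \frac{\partial\,dx}{\partial x}\,ddx
               = -\frac{dout}{x^{2}}\,ddx
               = -\left(\frac{dout}{x}\right)\left(\frac{ddx}{x}\right)

which is exactly what LogGradGradFunctor computes for DDOut and DX later in this change.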
DECLARE_INPLACE_OP_INFERER(ActivationGradOpInplaceInferer, DECLARE_INPLACE_OP_INFERER(ActivationGradOpInplaceInferer,
{framework::GradVarName("Out"), {framework::GradVarName("Out"),
framework::GradVarName("X")}); framework::GradVarName("X")});
...@@ -1214,7 +1254,13 @@ REGISTER_OPERATOR( ...@@ -1214,7 +1254,13 @@ REGISTER_OPERATOR(
std::conditional<ops::CanInplaceAct<ops::AbsGradFunctor<float>>(), std::conditional<ops::CanInplaceAct<ops::AbsGradFunctor<float>>(),
ops::ActFwdInplaceInferer, void>::type); ops::ActFwdInplaceInferer, void>::type);
REGISTER_OPERATOR(abs_grad, ops::ActivationOpGrad, REGISTER_OPERATOR(abs_grad, ops::ActivationOpGrad,
ops::ActivationGradOpInplaceInferer); ops::ActivationGradOpInplaceInferer,
ops::AbsDoubleGradMaker<paddle::framework::OpDesc>,
ops::AbsDoubleGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(
abs_grad_grad,
ops::ActivationOpDoubleGrad<ops::AbsGradGradFunctor<float>::FwdDeps()>,
ops::ActivationDoubleGradOpInplaceInferer);
REGISTER_OP_CPU_KERNEL(abs, REGISTER_OP_CPU_KERNEL(abs,
ops::ActivationKernel<paddle::platform::CPUDeviceContext, ops::ActivationKernel<paddle::platform::CPUDeviceContext,
...@@ -1234,6 +1280,47 @@ REGISTER_OP_CPU_KERNEL( ...@@ -1234,6 +1280,47 @@ REGISTER_OP_CPU_KERNEL(
ops::AbsGradFunctor<int>>, ops::AbsGradFunctor<int>>,
ops::ActivationGradKernel<paddle::platform::CPUDeviceContext, ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
ops::AbsGradFunctor<int64_t>>); ops::AbsGradFunctor<int64_t>>);
REGISTER_OP_CPU_KERNEL(
abs_grad_grad,
ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
ops::AbsGradGradFunctor<float>>,
ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
ops::AbsGradGradFunctor<double>>,
ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
ops::AbsGradGradFunctor<plat::float16>>,
ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
ops::AbsGradGradFunctor<int>>,
ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
ops::AbsGradGradFunctor<int64_t>>);
/* ========================================================================== */
/* ========================== Log register ==================================*/
REGISTER_OPERATOR(
log, ops::ActivationOp, ops::LogOpMaker, ops::ActivationOpInferVarType,
ops::ActivationGradOpMaker<ops::LogGradFunctor<float>::FwdDeps(),
paddle::framework::OpDesc>,
ops::ActivationGradOpMaker<ops::LogGradFunctor<float>::FwdDeps(),
paddle::imperative::OpBase>,
ops::ActFwdInplaceInferer);
REGISTER_OPERATOR(log_grad, ops::ActivationOpGrad,
ops::ActivationGradOpInplaceInferer,
ops::LogDoubleGradMaker<paddle::framework::OpDesc>,
ops::LogDoubleGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(
log_grad_grad,
ops::ActivationOpDoubleGrad<ops::LogGradGradFunctor<float>::FwdDeps()>,
ops::ActivationDoubleGradOpInplaceInferer);
REGISTER_ACTIVATION_CPU_KERNEL(log, Log, LogFunctor, LogGradFunctor);
REGISTER_OP_CPU_KERNEL(
log_grad_grad, ops::LogDoubleGradKernel<plat::CPUDeviceContext,
ops::LogGradGradFunctor<float>>,
ops::LogDoubleGradKernel<plat::CPUDeviceContext,
ops::LogGradGradFunctor<double>>,
ops::LogDoubleGradKernel<plat::CPUDeviceContext,
ops::LogGradGradFunctor<plat::float16>>);
/* ========================================================================== */ /* ========================================================================== */
/* ========================== register checkpoint ===========================*/ /* ========================== register checkpoint ===========================*/
......
...@@ -160,7 +160,7 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -160,7 +160,7 @@ REGISTER_OP_CUDA_KERNEL(
ops::ExpGradFunctor<plat::float16>>); ops::ExpGradFunctor<plat::float16>>);
/* ========================================================================== */ /* ========================================================================== */
/* ========================== exp register ============================ */ /* ========================== abs register ============================ */
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
abs, ops::ActivationKernel<plat::CUDADeviceContext, ops::AbsFunctor<float>>, abs, ops::ActivationKernel<plat::CUDADeviceContext, ops::AbsFunctor<float>>,
...@@ -180,4 +180,28 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -180,4 +180,28 @@ REGISTER_OP_CUDA_KERNEL(
ops::AbsGradFunctor<int64_t>>, ops::AbsGradFunctor<int64_t>>,
ops::ActivationGradKernel<plat::CUDADeviceContext, ops::ActivationGradKernel<plat::CUDADeviceContext,
ops::AbsGradFunctor<plat::float16>>); ops::AbsGradFunctor<plat::float16>>);
REGISTER_OP_CUDA_KERNEL(
abs_grad_grad,
ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
ops::AbsGradGradFunctor<float>>,
ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
ops::AbsGradGradFunctor<double>>,
ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
ops::AbsGradGradFunctor<plat::float16>>,
ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
ops::AbsGradGradFunctor<int>>,
ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
ops::AbsGradGradFunctor<int64_t>>);
/* ========================================================================== */
/* ========================== Log register ==================================*/
REGISTER_ACTIVATION_CUDA_KERNEL(log, Log, LogFunctor, LogGradFunctor);
REGISTER_OP_CUDA_KERNEL(
log_grad_grad, ops::LogDoubleGradKernel<plat::CUDADeviceContext,
ops::LogGradGradFunctor<float>>,
ops::LogDoubleGradKernel<plat::CUDADeviceContext,
ops::LogGradGradFunctor<double>>,
ops::LogDoubleGradKernel<plat::CUDADeviceContext,
ops::LogGradGradFunctor<plat::float16>>);
/* ========================================================================== */ /* ========================================================================== */
...@@ -1430,6 +1430,27 @@ class ActivationDoubleGradKernel ...@@ -1430,6 +1430,27 @@ class ActivationDoubleGradKernel
} }
}; };
template <typename T>
struct AbsGradGradFunctor : public BaseActivationFunctor<T> {
template <typename Device>
void operator()(const Device& dev, const framework::Tensor* X,
const framework::Tensor* Out, const framework::Tensor* ddX,
framework::Tensor* ddOut, framework::Tensor* dOut,
framework::Tensor* dX) const {
auto* d = dev.eigen_device();
auto ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "AbsGradGrad"));
auto x = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(X, "Input", "X", "AbsGradGrad"));
if (ddOut) {
auto ddout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DDOut", "AbsGradGrad"));
ddout.device(*d) = ddx * x.sign();
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
};
template <typename T> template <typename T>
struct ReluGradGradFunctor : public BaseActivationFunctor<T> { struct ReluGradGradFunctor : public BaseActivationFunctor<T> {
template <typename Device> template <typename Device>
...@@ -1642,6 +1663,10 @@ class SquareDoubleGradKernel ...@@ -1642,6 +1663,10 @@ class SquareDoubleGradKernel
} }
}; };
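// Note: LogDoubleGradKernel below reuses SquareDoubleGradKernel's
// input/output plumbing (X, DDX, DOut in; DX, DDOut out); only the
// Functor type (LogGradGradFunctor) differs.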
template <typename DeviceContext, typename Functor>
class LogDoubleGradKernel
: public SquareDoubleGradKernel<DeviceContext, Functor> {};
template <typename DeviceContext, typename Functor> template <typename DeviceContext, typename Functor>
class ELUDoubleGradKernel class ELUDoubleGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> { : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
...@@ -1831,6 +1856,37 @@ class PowGradKernel ...@@ -1831,6 +1856,37 @@ class PowGradKernel
functor(*place, x, out, dout, dx); functor(*place, x, out, dout, dx);
} }
}; };
template <typename T>
struct LogGradGradFunctor : public BaseActivationFunctor<T> {
template <typename Device>
void operator()(const Device& dev, const framework::Tensor* X,
const framework::Tensor* ddX, framework::Tensor* ddOut,
const framework::Tensor* dOut, framework::Tensor* dX) const {
auto* d = dev.eigen_device();
auto ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "LogGradGrad"));
auto x = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(X, "Input", "X", "LogGradGrad"));
// ddout = ddx / x; dx = -(dout / x) * (ddx / x)
// calculate dx first, so ddout can inplace ddx
if (dX) {
auto dout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOut, "Output", "DOut", "LogGradGrad"));
auto dx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dX, "Output", "DX", "LogGradGrad"));
dx.device(*d) = dout * static_cast<T>(-1) * ddx / (x * x);
}
if (ddOut) {
auto ddout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DDOut", "LogGradGrad"));
ddout.device(*d) = ddx * static_cast<T>(1) / x;
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -1851,7 +1907,6 @@ class PowGradKernel ...@@ -1851,7 +1907,6 @@ class PowGradKernel
__macro(cosh, Cosh, CoshFunctor, CoshGradFunctor); \ __macro(cosh, Cosh, CoshFunctor, CoshGradFunctor); \
__macro(round, Round, RoundFunctor, ZeroGradFunctor); \ __macro(round, Round, RoundFunctor, ZeroGradFunctor); \
__macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \
__macro(log, Log, LogFunctor, LogGradFunctor); \
__macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \ __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \
__macro(brelu, BRelu, BReluFunctor, BReluGradFunctor); \ __macro(brelu, BRelu, BReluFunctor, BReluGradFunctor); \
__macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \ __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/activation_op.h"
#include <string>
#include "paddle/fluid/platform/xpu_header.h"
namespace paddle {
namespace operators {
using paddle::framework::Tensor;
template <typename Functor>
class XPUActivationKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext &context) const override {
Functor functor;
auto attrs = functor.GetAttrs();
for (auto &attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
functor(context);
}
};
template <typename Functor>
class XPUActivationGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext &context) const override {
Functor functor;
auto attrs = functor.GetAttrs();
for (auto &attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
functor(context);
}
};
template <typename DeviceContext, typename T>
void xpu_activation_forward(const framework::ExecutionContext &ctx,
xpu::Activation_t type) {
const auto *x = ctx.Input<Tensor>("X");
auto *y = ctx.Output<Tensor>("Out");
const T *x_data = x->data<T>();
T *y_data = y->mutable_data<T>(ctx.GetPlace());
int r = 0;
if (xpu::Activation_t::ACT_POW == type.type) {
type.pow_factor = ctx.Attr<float>("factor");
}
auto xpu_context = ctx.device_context<DeviceContext>().x_context();
r = xpu::activation_forward(xpu_context, type, x->numel(),
reinterpret_cast<const float *>(x_data),
reinterpret_cast<float *>(y_data));
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
}
template <typename DeviceContext, typename T>
void xpu_activation_backward(const framework::ExecutionContext &ctx,
xpu::Activation_t type) {
/* TODO: relu tanh sigmoid are inplace */
const auto *x = ctx.Input<Tensor>("X");
auto *y = ctx.Input<Tensor>("Out");
auto *dOut = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto *dX = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
const T *x_data = nullptr;
const T *y_data = nullptr;
const T *y_grad = nullptr;
if (x != nullptr) x_data = x->data<T>();
if (y != nullptr) y_data = y->data<T>();
if (dOut != nullptr) y_grad = dOut->data<T>();
T *x_grad = dX->mutable_data<T>(ctx.GetPlace());
auto xpu_context = ctx.device_context<DeviceContext>().x_context();
int r = xpu::activation_backward(xpu_context, type, dX->numel(),
reinterpret_cast<const float *>(x_data),
reinterpret_cast<const float *>(y_data),
reinterpret_cast<const float *>(y_grad),
reinterpret_cast<float *>(x_grad));
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
}
template <typename T, xpu::Activation_t::act_enum algorithm>
struct XPUActivationFunc : public BaseActivationFunctor<T> {
void operator()(const framework::ExecutionContext &ctx) const {
xpu_activation_forward<paddle::platform::XPUDeviceContext, T>(ctx,
algorithm);
}
};
template <typename T, xpu::Activation_t::act_enum algorithm>
struct XPUActivationGradFunc : public BaseActivationFunctor<T> {
void operator()(const framework::ExecutionContext &ctx) const {
xpu_activation_backward<paddle::platform::XPUDeviceContext, T>(ctx,
algorithm);
}
};
template <typename T>
using XPUReluFunctor = XPUActivationFunc<T, xpu::Activation_t::RELU>;
template <typename T>
using XPUSigmoidFunctor = XPUActivationFunc<T, xpu::Activation_t::SIGMOID>;
template <typename T>
using XPUTanhFunctor = XPUActivationFunc<T, xpu::Activation_t::TANH>;
template <typename T>
using XPUGeluFunctor = XPUActivationFunc<T, xpu::Activation_t::GELU>;
template <typename T>
using XPULogFunctor = XPUActivationFunc<T, xpu::Activation_t::LOG>;
template <typename T>
using XPUSquareFunctor = XPUActivationFunc<T, xpu::Activation_t::SQUARE>;
template <typename T>
using XPUSquareGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::SQUARE>;
template <typename T>
using XPUReluGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::RELU>;
template <typename T>
using XPUSigmoidGradFunctor =
XPUActivationGradFunc<T, xpu::Activation_t::SIGMOID>;
template <typename T>
using XPUTanhGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::TANH>;
template <typename T>
using XPUGeluGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::GELU>;
template <typename T>
using XPUSqrtFunctor = XPUActivationFunc<T, xpu::Activation_t::SQRT>;
template <typename T>
using XPUSqrtGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::SQRT>;
template <typename T>
using XPUACTPowFunctor = XPUActivationFunc<T, xpu::Activation_t::ACT_POW>;
template <typename T>
using XPUABSFunctor = XPUActivationFunc<T, xpu::Activation_t::ABS>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
#define REGISTER_ACTIVATION_XPU_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_XPU_KERNEL(act_type, \
ops::XPUActivationKernel<ops::functor<float>>); \
REGISTER_OP_XPU_KERNEL( \
act_type##_grad, \
ops::XPUActivationGradKernel<ops::grad_functor<float>>);
REGISTER_ACTIVATION_XPU_KERNEL(relu, XPUReluFunctor, XPUReluGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(tanh, XPUTanhFunctor, XPUTanhGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, XPUSigmoidFunctor,
XPUSigmoidGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(gelu, XPUGeluFunctor, XPUGeluGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(sqrt, XPUSqrtFunctor, XPUSqrtGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(square, XPUSquareFunctor, XPUSquareGradFunctor)
REGISTER_OP_XPU_KERNEL(log,
ops::XPUActivationKernel<ops::XPULogFunctor<float>>);
REGISTER_OP_XPU_KERNEL(pow,
ops::XPUActivationKernel<ops::XPUACTPowFunctor<float>>);
REGISTER_OP_XPU_KERNEL(abs,
ops::XPUActivationKernel<ops::XPUABSFunctor<float>>);
#endif // PADDLE_WITH_XPU
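To make the registration macro above concrete, this is (roughly) what one invocation expands to after token pasting of act_type##_grad; a sketch only, formatting approximate:

// REGISTER_ACTIVATION_XPU_KERNEL(relu, XPUReluFunctor, XPUReluGradFunctor)
// expands to:
//   REGISTER_OP_XPU_KERNEL(
//       relu, ops::XPUActivationKernel<ops::XPUReluFunctor<float>>);
//   REGISTER_OP_XPU_KERNEL(
//       relu_grad,
//       ops::XPUActivationGradKernel<ops::XPUReluGradFunctor<float>>);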
...@@ -839,6 +839,7 @@ void BatchNormDoubleGradMaker<T>::Apply(GradOpPtr<T> op) const { ...@@ -839,6 +839,7 @@ void BatchNormDoubleGradMaker<T>::Apply(GradOpPtr<T> op) const {
op->SetInput("SavedMean", this->Input("SavedMean")); op->SetInput("SavedMean", this->Input("SavedMean"));
op->SetInput("SavedVariance", this->Input("SavedVariance")); op->SetInput("SavedVariance", this->Input("SavedVariance"));
if (BOOST_GET_CONST(bool, this->GetAttr("use_global_stats"))) { if (BOOST_GET_CONST(bool, this->GetAttr("use_global_stats"))) {
op->SetInput("Mean", this->Input("Mean"));
op->SetInput("Variance", this->Input("Variance")); op->SetInput("Variance", this->Input("Variance"));
} }
op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
...@@ -868,14 +869,19 @@ void BatchNormDoubleGradOp::InferShape( ...@@ -868,14 +869,19 @@ void BatchNormDoubleGradOp::InferShape(
"BatchNormDoubleGrad"); "BatchNormDoubleGrad");
} }
OP_INOUT_CHECK(ctx->HasInput("DDX"), "Input", "DDX", "BatchNormDoubleGrad");
OP_INOUT_CHECK(ctx->HasInput("DY"), "Input", "DY", "BatchNormDoubleGrad"); OP_INOUT_CHECK(ctx->HasInput("DY"), "Input", "DY", "BatchNormDoubleGrad");
// check output // check output
OP_INOUT_CHECK(ctx->HasOutput("DX"), "Output", "DX", "BatchNormDoubleGrad"); OP_INOUT_CHECK(ctx->HasOutput("DX"), "Output", "DX", "BatchNormDoubleGrad");
const auto x_dims = ctx->GetInputDim("X"); const auto x_dims = ctx->GetInputDim("X");
const int C = x_dims[1]; const DataLayout data_layout = framework::StringToDataLayout(
ctx->Attrs().Get<std::string>("data_layout"));
const int C =
((this->IsMKLDNNType() == true) || (data_layout == DataLayout::kNCHW)
? x_dims[1]
: x_dims[x_dims.size() - 1]);
if (ctx->HasOutput("DX")) { if (ctx->HasOutput("DX")) {
ctx->SetOutputDim("DX", x_dims); ctx->SetOutputDim("DX", x_dims);
} }
...@@ -957,7 +963,9 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T> ...@@ -957,7 +963,9 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
Tensor inv_var_tensor; Tensor inv_var_tensor;
if (use_global_stats) { if (use_global_stats) {
const auto *running_mean = ctx.Input<Tensor>("Mean");
const auto *running_variance = ctx.Input<Tensor>("Variance"); const auto *running_variance = ctx.Input<Tensor>("Variance");
mean_data = running_mean->data<T>();
inv_var_tensor.Resize({C}); inv_var_tensor.Resize({C});
T *running_inv_var_data = inv_var_tensor.mutable_data<T>(ctx.GetPlace()); T *running_inv_var_data = inv_var_tensor.mutable_data<T>(ctx.GetPlace());
...@@ -1077,12 +1085,12 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T> ...@@ -1077,12 +1085,12 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
// (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW * // (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW *
// np.sum(dy, // np.sum(dy,
// axis=(n,h,w)) * (x - mean) * // axis=(n,h,w)) * (x - mean) *
// (np.mean(ddx, axis=(n,h,w)) - ddx) + ddr * (dy * inv_var - // (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var -
// inv_var // inv_var
// * // *
// np.mean(dy, axis=(n,h,w)) - // np.mean(dy, axis=(n,h,w)) -
// inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
// axis=(n,h,w)))) // axis=(n,h,w)))
if (ddX) { if (ddX) {
dx_arr += dx_arr +=
...@@ -1176,7 +1184,8 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T> ...@@ -1176,7 +1184,8 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
C, sample_size); C, sample_size);
ddy_arr.setZero(); ddy_arr.setZero();
if (use_global_stats) { if (use_global_stats) {
// math: ddy = r * ddx * inv_var // math: ddy = r * ddx * inv_var + ddbias +
// ddscale * (x - mean) * inv_var
if (ddX) { if (ddX) {
ddy_arr = scale_tile_data * ddx_arr * inv_var_tile_data; ddy_arr = scale_tile_data * ddx_arr * inv_var_tile_data;
} }
...@@ -1196,25 +1205,29 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T> ...@@ -1196,25 +1205,29 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
.replicate(1, sample_size) / .replicate(1, sample_size) /
sample_size); sample_size);
} }
if (ddScale && ddBias) { }
ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C); if (ddScale) {
Tensor ddscale_tile; ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
ddscale_tile.Resize({C, sample_size}); Tensor ddscale_tile;
EigenArrayMap<T> ddscale_tile_data( ddscale_tile.Resize({C, sample_size});
ddscale_tile.mutable_data<T>(ctx.GetPlace()), C, sample_size); EigenArrayMap<T> ddscale_tile_data(
ddscale_tile_data = ddscale_arr.replicate(1, sample_size); ddscale_tile.mutable_data<T>(ctx.GetPlace()), C, sample_size);
ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data;
}
ConstEigenVectorArrayMap<T> ddbias_arr(ddBias->data<T>(), C); if (ddBias) {
Tensor ddbias_tile; ConstEigenVectorArrayMap<T> ddbias_arr(ddBias->data<T>(), C);
ddbias_tile.Resize({C, sample_size}); Tensor ddbias_tile;
EigenArrayMap<T> ddbias_tile_data( ddbias_tile.Resize({C, sample_size});
ddbias_tile.mutable_data<T>(ctx.GetPlace()), C, sample_size); EigenArrayMap<T> ddbias_tile_data(
ddbias_tile_data = ddbias_arr.replicate(1, sample_size); ddbias_tile.mutable_data<T>(ctx.GetPlace()), C, sample_size);
ddbias_tile_data = ddbias_arr.replicate(1, sample_size);
ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data; ddy_arr += ddbias_tile_data;
ddy_arr += ddbias_tile_data;
}
} }
if (data_layout == DataLayout::kNCHW) { if (data_layout == DataLayout::kNCHW) {
VLOG(3) << "Transform batchnorm output from NHWC to NCHW"; VLOG(3) << "Transform batchnorm output from NHWC to NCHW";
TransToChannelFirst<paddle::platform::CPUDeviceContext, T>( TransToChannelFirst<paddle::platform::CPUDeviceContext, T>(
......
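Written out, the use_global_stats branch of the double-grad above computes, per channel (with \gamma the scale, \mu and \sigma the running mean and standard deviation, \sigma = \sqrt{Var + \epsilon}):

ddY = \gamma \cdot \frac{ddX}{\sigma}
    + dd\gamma \cdot \frac{X - \mu}{\sigma}
    + dd\beta

matching the comment "ddy = r * ddx * inv_var + ddbias + ddscale * (x - mean) * inv_var"; after this change each of the three terms is guarded independently by ddX, ddScale, and ddBias being present.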
...@@ -47,8 +47,8 @@ void OpTester::Init(const OpTesterConfig &config) { ...@@ -47,8 +47,8 @@ void OpTester::Init(const OpTesterConfig &config) {
CreateInputVarDesc(); CreateInputVarDesc();
CreateOutputVarDesc(); CreateOutputVarDesc();
} else { } else {
PADDLE_THROW(platform::errors::NotFound("Operator '%s' is not registered.", PADDLE_THROW(platform::errors::NotFound(
config_.op_type)); "Operator '%s' is not registered in OpTester.", config_.op_type));
} }
if (config_.device_id >= 0) { if (config_.device_id >= 0) {
...@@ -81,7 +81,8 @@ void OpTester::Run() { ...@@ -81,7 +81,8 @@ void OpTester::Run() {
platform::EnableProfiler(platform::ProfilerState::kAll); platform::EnableProfiler(platform::ProfilerState::kAll);
platform::SetDeviceId(config_.device_id); platform::SetDeviceId(config_.device_id);
#else #else
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); PADDLE_THROW(platform::errors::PermissionDenied(
"'CUDAPlace' is not supported in CPU only device."));
#endif #endif
} }
...@@ -162,7 +163,8 @@ framework::proto::VarType::Type OpTester::TransToVarType(std::string str) { ...@@ -162,7 +163,8 @@ framework::proto::VarType::Type OpTester::TransToVarType(std::string str) {
} else if (str == "fp64") { } else if (str == "fp64") {
return framework::proto::VarType::FP64; return framework::proto::VarType::FP64;
} else { } else {
PADDLE_THROW("Unsupported dtype %s.", str.c_str()); PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported dtype %s in OpTester.", str.c_str()));
} }
} }
...@@ -233,8 +235,8 @@ void OpTester::CreateOpDesc() { ...@@ -233,8 +235,8 @@ void OpTester::CreateOpDesc() {
case framework::proto::AttrType::INTS: case framework::proto::AttrType::INTS:
case framework::proto::AttrType::FLOATS: case framework::proto::AttrType::FLOATS:
case framework::proto::AttrType::STRINGS: case framework::proto::AttrType::STRINGS:
PADDLE_THROW( PADDLE_THROW(platform::errors::Unimplemented(
platform::errors::Unimplemented("Not supported STRINGS type yet.")); "Unsupported STRINGS type in OpTester yet."));
break; break;
case framework::proto::AttrType::LONG: { case framework::proto::AttrType::LONG: {
int64_t value = StringTo<int64_t>(value_str); int64_t value = StringTo<int64_t>(value_str);
...@@ -242,7 +244,8 @@ void OpTester::CreateOpDesc() { ...@@ -242,7 +244,8 @@ void OpTester::CreateOpDesc() {
} break; } break;
case framework::proto::AttrType::LONGS: case framework::proto::AttrType::LONGS:
default: default:
PADDLE_THROW("Unsupport attr type %d", type); PADDLE_THROW(platform::errors::Unimplemented(
"Unsupport attr type %d in OpTester.", type));
} }
} }
} }
...@@ -299,7 +302,8 @@ void OpTester::SetupTensor(framework::LoDTensor *tensor, ...@@ -299,7 +302,8 @@ void OpTester::SetupTensor(framework::LoDTensor *tensor,
} }
is.close(); is.close();
} else { } else {
PADDLE_THROW("Unsupported initializer %s.", initializer.c_str()); PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported initializer %s in OpTester.", initializer.c_str()));
} }
if (!platform::is_cpu_place(place_)) { if (!platform::is_cpu_place(place_)) {
...@@ -351,7 +355,8 @@ void OpTester::CreateVariables(framework::Scope *scope) { ...@@ -351,7 +355,8 @@ void OpTester::CreateVariables(framework::Scope *scope) {
static_cast<double>(1.0), item.second.initializer, static_cast<double>(1.0), item.second.initializer,
item.second.filename); item.second.filename);
} else { } else {
PADDLE_THROW("Unsupported dtype %d.", data_type); PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported dtype %d in OpTester.", data_type));
} }
VLOG(3) << "Set lod for tensor " << var_name; VLOG(3) << "Set lod for tensor " << var_name;
...@@ -473,7 +478,8 @@ std::string OpTester::DebugString() { ...@@ -473,7 +478,8 @@ std::string OpTester::DebugString() {
<< "\n"; << "\n";
} break; } break;
default: default:
PADDLE_THROW("Unsupport attr type %d", attr_type); PADDLE_THROW(platform::errors::Unimplemented(
"Unsupport attr type %d in OpTester.", attr_type));
} }
ss << GenSpaces(--count) << "}\n"; ss << GenSpaces(--count) << "}\n";
} }
...@@ -484,8 +490,10 @@ std::string OpTester::DebugString() { ...@@ -484,8 +490,10 @@ std::string OpTester::DebugString() {
TEST(op_tester, base) { TEST(op_tester, base) {
if (!FLAGS_op_config_list.empty()) { if (!FLAGS_op_config_list.empty()) {
std::ifstream fin(FLAGS_op_config_list, std::ios::in | std::ios::binary); std::ifstream fin(FLAGS_op_config_list, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", PADDLE_ENFORCE_EQ(
FLAGS_op_config_list.c_str()); static_cast<bool>(fin), true,
platform::errors::InvalidArgument("OpTester cannot open file %s",
FLAGS_op_config_list.c_str()));
std::vector<OpTesterConfig> op_configs; std::vector<OpTesterConfig> op_configs;
while (!fin.eof()) { while (!fin.eof()) {
VLOG(4) << "Reading config " << op_configs.size() << "..."; VLOG(4) << "Reading config " << op_configs.size() << "...";
......
...@@ -78,7 +78,8 @@ void OpInputConfig::ParseDType(std::istream& is) { ...@@ -78,7 +78,8 @@ void OpInputConfig::ParseDType(std::istream& is) {
} else if (dtype_str == "fp64" || dtype_str == "double") { } else if (dtype_str == "fp64" || dtype_str == "double") {
dtype = "fp64"; dtype = "fp64";
} else { } else {
PADDLE_THROW("Unsupported dtype %s", dtype_str.c_str()); PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported dtype %s in OpInputConfig.", dtype_str.c_str()));
} }
VLOG(4) << "dtype of input " << name << " is: " << dtype; VLOG(4) << "dtype of input " << name << " is: " << dtype;
} }
...@@ -91,7 +92,9 @@ void OpInputConfig::ParseInitializer(std::istream& is) { ...@@ -91,7 +92,9 @@ void OpInputConfig::ParseInitializer(std::istream& is) {
const std::vector<std::string> supported_initializers = {"random", "natural", const std::vector<std::string> supported_initializers = {"random", "natural",
"zeros", "file"}; "zeros", "file"};
if (!Has(supported_initializers, initializer_str)) { if (!Has(supported_initializers, initializer_str)) {
PADDLE_THROW("Unsupported initializer %s", initializer_str.c_str()); PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported initializer %s in OpInputConfig.",
initializer_str.c_str()));
} }
initializer = initializer_str; initializer = initializer_str;
...@@ -126,7 +129,12 @@ void OpInputConfig::ParseLoD(std::istream& is) { ...@@ -126,7 +129,12 @@ void OpInputConfig::ParseLoD(std::istream& is) {
} }
} }
EraseEndSep(&lod_str); EraseEndSep(&lod_str);
PADDLE_ENFORCE_GE(lod_str.length(), 4U); PADDLE_ENFORCE_GE(
lod_str.length(), 4U,
platform::errors::InvalidArgument(
"The length of lod string should be "
"equal to or larger than 4. But length of lod string is %zu.",
lod_str.length()));
VLOG(4) << "lod: " << lod_str << ", length: " << lod_str.length(); VLOG(4) << "lod: " << lod_str << ", length: " << lod_str.length();
// Parse the lod_str // Parse the lod_str
...@@ -153,8 +161,10 @@ void OpInputConfig::ParseLoD(std::istream& is) { ...@@ -153,8 +161,10 @@ void OpInputConfig::ParseLoD(std::istream& is) {
OpTesterConfig::OpTesterConfig(const std::string& filename) { OpTesterConfig::OpTesterConfig(const std::string& filename) {
std::ifstream fin(filename, std::ios::in | std::ios::binary); std::ifstream fin(filename, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", PADDLE_ENFORCE_EQ(
filename.c_str()); static_cast<bool>(fin), true,
platform::errors::InvalidArgument("OpTesterConfig cannot open file %s.",
filename.c_str()));
Init(fin); Init(fin);
} }
......
...@@ -166,7 +166,8 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( ...@@ -166,7 +166,8 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
#endif #endif
if (input_data_type != framework::proto::VarType::INT8 && if (input_data_type != framework::proto::VarType::INT8 &&
input_data_type != framework::proto::VarType::UINT8) { input_data_type != framework::proto::VarType::UINT8 &&
input_data_type != framework::proto::VarType::BF16) {
auto filter_data_type = ctx.Input<Tensor>("Filter")->type(); auto filter_data_type = ctx.Input<Tensor>("Filter")->type();
PADDLE_ENFORCE_EQ(input_data_type, filter_data_type, PADDLE_ENFORCE_EQ(input_data_type, filter_data_type,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
...@@ -455,6 +456,11 @@ void Conv3DOpMaker::Make() { ...@@ -455,6 +456,11 @@ void Conv3DOpMaker::Make() {
AddAttr<bool>("use_mkldnn", AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel") "(bool, default false) Only used in mkldnn kernel")
.SetDefault(false); .SetDefault(false);
AddAttr<std::string>(
"mkldnn_data_type",
"(string, default \"float32\"). Data type of mkldnn kernel")
.SetDefault("float32")
.InEnum({"float32", "int8", "bfloat16"});
AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel") AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
.SetDefault(false); .SetDefault(false);
AddAttr<std::string>("fuse_activation", AddAttr<std::string>("fuse_activation",
......
...@@ -175,10 +175,6 @@ void RecvGeoSparseRecords(const CommContext &rpc_ctx, ...@@ -175,10 +175,6 @@ void RecvGeoSparseRecords(const CommContext &rpc_ctx,
template <typename T> template <typename T>
void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) { void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto cpu_place = platform::CPUPlace();
auto &cpu_ctx = *pool.Get(cpu_place);
distributed::RPCClient *rpc_client = distributed::RPCClient *rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>(rpc_ctx.trainer_id); distributed::RPCClient::GetInstance<RPCCLIENT_T>(rpc_ctx.trainer_id);
...@@ -188,8 +184,13 @@ void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) { ...@@ -188,8 +184,13 @@ void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) {
if (rpc_ctx.origin_varnames.size() == 1 && if (rpc_ctx.origin_varnames.size() == 1 &&
rpc_ctx.splited_varnames.size() == 1) { rpc_ctx.splited_varnames.size() == 1) {
auto varname = rpc_ctx.origin_varnames[0]; auto varname = rpc_ctx.origin_varnames[0];
VLOG(4) << "recv " << varname << " from " << rpc_ctx.epmap[0]; const auto place =
rets.push_back(rpc_client->AsyncGetVarNoBarrier(rpc_ctx.epmap[0], cpu_ctx, scope.FindVar(varname)->Get<framework::LoDTensor>().place();
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &ctx = *pool.Get(place);
VLOG(4) << "recv " << varname << " from " << rpc_ctx.epmap[0] << " in gpu? "
<< platform::is_gpu_place(place);
rets.push_back(rpc_client->AsyncGetVarNoBarrier(rpc_ctx.epmap[0], ctx,
scope, varname, varname)); scope, varname, varname));
for (size_t i = 0; i < rets.size(); i++) { for (size_t i = 0; i < rets.size(); i++) {
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_xpu.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ElementwiseAddXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
XPUElementwise<T, XPUAddFunctor<T>>(ctx);
}
};
template <typename DeviceContext, typename T>
class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
ElemwiseGradKernel<T>::Compute(ctx);
using Tensor = framework::Tensor;
auto *dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto *dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
auto dx_dims = dout->dims();
auto dy_dims_untrimed = dout->dims();
T *dx_data = NULL;
T *dy_data = NULL;
int axis = ctx.Attr<int>("axis");
PADDLE_ENFORCE_GE(dx_dims.size(), dy_dims_untrimed.size(),
"Rank of first input must >= rank of second input.");
if (dx != nullptr) {
dx->mutable_data<T>(ctx.GetPlace());
dx_dims = dx->dims();
dx_data = dx->data<T>();
}
if (dy != nullptr) {
dy->mutable_data<T>(ctx.GetPlace());
dy_dims_untrimed = dy->dims();
dy_data = dy->data<T>();
}
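// Decompose dout's shape into [pre, n, post], where n spans the dims
// shared with dy. When dy covers only n of the len = pre * n * post
// output elements, its gradient is first computed into a temporary
// buffer of length len and reduced back to n elements by reduce_ew
// further below.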
int pre, n, post, is_common_broadcast;
if (dx_dims == dy_dims_untrimed) {
pre = post = 1;
n = dout->numel();
} else {
axis = (axis == -1 ? dx_dims.size() - dy_dims_untrimed.size() : axis);
PADDLE_ENFORCE(axis >= 0 && axis < dx_dims.size(),
"Axis should be in range [0, dx_dims)");
auto dy_dims = trim_trailing_singular_dims(dy_dims_untrimed);
axis = (dy_dims.size() == 0) ? dx_dims.size() : axis;
get_mid_dims(dx_dims, dy_dims, axis, &pre, &n, &post,
&is_common_broadcast);
}
int len = pre * n * post;
auto &dev_ctx =
ctx.template device_context<paddle::platform::XPUDeviceContext>();
if (post == 1) {
int r = xpu::matrix_vector_add_grad(
dev_ctx.x_context(), dout->data<T>(), dout->data<T>(),
dout->data<T>(), dout->data<T>(), dx_data, dy_data, pre, n);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
return;
}
if (dx == nullptr) {
PADDLE_ENFORCE_EQ(
xpu_malloc(reinterpret_cast<void **>(&dx_data), len * sizeof(float)),
XPU_SUCCESS, platform::errors::External("XPU has no enough memory"));
}
if (dy == nullptr) {
PADDLE_ENFORCE_EQ(
xpu_malloc(reinterpret_cast<void **>(&dy_data), len * sizeof(float)),
XPU_SUCCESS, platform::errors::External("XPU has no enough memory"));
} else {
if (len != n) {
PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast<void **>(&dy_data),
len * sizeof(float)),
XPU_SUCCESS, platform::errors::External(
"XPU has no enough memory"));
}
}
int r = xpu::elementwise_add_grad(
dev_ctx.x_context(), dout->data<T>() /*x*/, dout->data<T>() /*y*/,
dout->data<T>() /*out*/, dout->data<T>(), dx_data, dy_data, len);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
if ((dy != nullptr) && (len != n)) {
r = xpu::reduce_ew(dev_ctx.x_context(), dy_data, dy->data<T>(), pre, n,
post, xpu::ElementwiseOp::ASSIGN);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
dev_ctx.Wait();
xpu_free(dy_data);
}
if ((dx == nullptr || dy == nullptr) && !(dy != nullptr && len != n)) {
dev_ctx.Wait();
}
if (dx == nullptr) {
xpu_free(dx_data);
}
if (dy == nullptr) {
xpu_free(dy_data);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
elementwise_add,
ops::ElementwiseAddXPUKernel<paddle::platform::XPUDeviceContext, float>);
REGISTER_OP_XPU_KERNEL(elementwise_add_grad,
ops::ElementwiseAddGradXPUKernel<
paddle::platform::XPUDeviceContext, float>);
#endif
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace operators {
template <typename T>
struct XPUAddFunctor {
int operator()(xpu::Context* ctx, const T* x, const T* y, T* z, int len) {
return xpu::elementwise_add(ctx, x, y, z, len);
}
};
template <typename T>
struct XPUMulFunctor {
int operator()(xpu::Context* ctx, const T* x, const T* y, T* z, int len) {
return xpu::elementwise_mul(ctx, x, y, z, len);
}
};
template <typename T, typename Functor>
void XPUElementwise(const framework::ExecutionContext& ctx) {
PADDLE_ENFORCE(platform::is_xpu_place(ctx.GetPlace()),
"This kernel only runs on XPU device.");
auto x_var = ctx.InputVar("X");
PADDLE_ENFORCE_NE(x_var, nullptr,
platform::errors::Fatal("Cannot get input Variable X"));
PADDLE_ENFORCE(x_var->IsType<framework::LoDTensor>(),
"XPU only support LoDTensor");
auto x = x_var->Get<framework::LoDTensor>();
auto* y = ctx.Input<framework::LoDTensor>("Y");
auto* z = ctx.Output<framework::LoDTensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
auto x_dims = x.dims();
auto y_dims_untrimed = y->dims();
PADDLE_ENFORCE_GE(x_dims.size(), y_dims_untrimed.size(),
"Rank of first input must >= rank of second input.");
axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
"Axis should be in range [0, x_dims)");
auto y_dims = trim_trailing_singular_dims(y_dims_untrimed);
axis = (y_dims.size() == 0) ? x_dims.size() : axis;
int pre, n, post, is_common_broadcast;
get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post, &is_common_broadcast);
int len = pre * n * post;
const T* x_data = x.data<T>();
const T* y_data = y->data<T>();
T* z_data = z->data<T>();
T* y_broadcast = nullptr;
auto& dev_ctx =
ctx.template device_context<paddle::platform::XPUDeviceContext>();
if (post == 1) {
if (std::is_same<Functor, XPUAddFunctor<T>>::value) {
int res = xpu::matrix_vector_add(dev_ctx.x_context(), x_data, y_data,
z_data, pre, n);
PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d",
res);
return;
}
if (std::is_same<Functor, XPUMulFunctor<T>>::value) {
int res = xpu::matrix_vector_mul(dev_ctx.x_context(), x_data, y_data,
z_data, pre, n);
PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d",
res);
return;
}
}
if (pre != 1 || post != 1) {
PADDLE_ENFORCE(xpu_malloc(reinterpret_cast<void**>(&y_broadcast),
len * sizeof(T)) == XPU_SUCCESS);
int res = xpu::broadcast_ew(dev_ctx.x_context(), y_data, y_broadcast, pre,
n, post, xpu::ElementwiseOp::ASSIGN);
PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d",
res);
y_data = y_broadcast;
}
Functor functor;
int res = functor(dev_ctx.x_context(), x_data, y_data, z_data, len);
PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d",
res);
if (pre != 1 || post != 1) {
dev_ctx.Wait();
xpu_free(y_broadcast);
}
}
} // namespace operators
} // namespace paddle
#endif
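To make the [pre, n, post] decomposition used by XPUElementwise concrete, here is a simplified, self-contained sketch of what get_mid_dims computes (ignoring trimmed singular dims and the is_common_broadcast path; mid_dims is a hypothetical name for illustration, not Paddle's API):

#include <cstdio>
#include <vector>

// pre  = product of x dims before `axis`
// n    = product of y's dims (the span y is broadcast over)
// post = product of x dims after the span covered by y
static void mid_dims(const std::vector<int>& x, const std::vector<int>& y,
                     int axis, int* pre, int* n, int* post) {
  *pre = *n = *post = 1;
  for (int i = 0; i < axis; ++i) *pre *= x[i];
  for (size_t i = 0; i < y.size(); ++i) *n *= y[i];
  for (size_t i = axis + y.size(); i < x.size(); ++i) *post *= x[i];
}

int main() {
  int pre, n, post;
  // x: [2, 3, 4, 5], y: [3, 4], axis = 1  ->  pre = 2, n = 12, post = 5
  mid_dims({2, 3, 4, 5}, {3, 4}, 1, &pre, &n, &post);
  std::printf("pre=%d n=%d post=%d\n", pre, n, post);
  return 0;
}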
...@@ -520,11 +520,11 @@ class InstanceNormDoubleGradKernel<platform::CPUDeviceContext, T> ...@@ -520,11 +520,11 @@ class InstanceNormDoubleGradKernel<platform::CPUDeviceContext, T>
// (np.mean(dy, axis=(h,w)) - dy) + inv_var.pow(3) / HxW * // (np.mean(dy, axis=(h,w)) - dy) + inv_var.pow(3) / HxW *
// np.sum(dy, // np.sum(dy,
// axis=(h,w)) * (x - mean) * // axis=(h,w)) * (x - mean) *
// (np.mean(ddx, axis=(h,w)) - ddx) + ddr * (dy * inv_var - inv_var // (np.mean(ddx, axis=(h,w)) - ddx)) + ddr * (dy * inv_var -
// * // inv_var *
// np.mean(dy, axis=(h,w)) - // np.mean(dy, axis=(h,w)) -
// inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
// axis=(h,w)))) // axis=(h,w)))
Tensor x_sub_mean_mul_invstd; Tensor x_sub_mean_mul_invstd;
x_sub_mean_mul_invstd.Resize({sample_size, NxC}); x_sub_mean_mul_invstd.Resize({sample_size, NxC});
......
...@@ -136,7 +136,6 @@ void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... args) { ...@@ -136,7 +136,6 @@ void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... args) {
} }
using Tensor = paddle::framework::Tensor; using Tensor = paddle::framework::Tensor;
template <typename KernelTuple, typename PlaceType> template <typename KernelTuple, typename PlaceType>
void BenchKernelXYZN() { void BenchKernelXYZN() {
using T = typename KernelTuple::data_type; using T = typename KernelTuple::data_type;
...@@ -320,8 +319,15 @@ void BenchKernelSgd() { ...@@ -320,8 +319,15 @@ void BenchKernelSgd() {
const T lr = 0.1; const T lr = 0.1;
auto UnDuplicatedRandomVec = [](int n, const int64_t lower, auto UnDuplicatedRandomVec = [](int n, const int64_t lower,
const int64_t upper) -> std::vector<int64_t> { const int64_t upper) -> std::vector<int64_t> {
PADDLE_ENFORCE_LE(static_cast<size_t>(upper - lower), n - 1); PADDLE_ENFORCE_LE(
PADDLE_ENFORCE_GT(n, 0); static_cast<size_t>(upper - lower), n - 1,
paddle::platform::errors::InvalidArgument(
"The range of Sgd (upper - lower) should be equal to or lower "
"than n-1 (Sgd size -1). But upper - lower is %d and n-1 is %d.",
static_cast<size_t>(upper - lower), (n - 1)));
PADDLE_ENFORCE_GT(
n, 0, paddle::platform::errors::InvalidArgument(
"The Sgd size should be larger than 0. But the n is %d.", n));
std::vector<int64_t> all, out; std::vector<int64_t> all, out;
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
all.push_back(i); all.push_back(i);
......
...@@ -132,11 +132,31 @@ class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> { ...@@ -132,11 +132,31 @@ class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> {
} }
std::unique_ptr<GenBase> CreateJitCode( std::unique_ptr<GenBase> CreateJitCode(
const emb_seq_pool_attr_t& attr) const override { const emb_seq_pool_attr_t& attr) const override {
PADDLE_ENFORCE_GT(attr.table_height, 0); PADDLE_ENFORCE_GT(attr.table_height, 0,
PADDLE_ENFORCE_GT(attr.table_width, 0); platform::errors::InvalidArgument(
PADDLE_ENFORCE_GT(attr.index_height, 0); "The attribute table_height of EmbSeqPool should "
PADDLE_ENFORCE_GT(attr.index_width, 0); "be larger than 0. But it is %d.",
PADDLE_ENFORCE_GT(attr.out_width, 0); attr.table_height));
PADDLE_ENFORCE_GT(attr.table_width, 0,
platform::errors::InvalidArgument(
"The attribute table_width of EmbSeqPool should "
"be larger than 0. But it is %d.",
attr.table_width));
PADDLE_ENFORCE_GT(attr.index_height, 0,
platform::errors::InvalidArgument(
"The attribute index_height of EmbSeqPool should "
"be larger than 0. But it is %d.",
attr.index_height));
PADDLE_ENFORCE_GT(attr.index_width, 0,
platform::errors::InvalidArgument(
"The attribute index_width of EmbSeqPool should "
"be larger than 0. But it is %d.",
attr.index_width));
PADDLE_ENFORCE_GT(attr.out_width, 0,
platform::errors::InvalidArgument(
"The attribute out_width of EmbSeqPool should be "
"larger than 0. But it is %d.",
attr.out_width));
return make_unique<EmbSeqPoolJitCode>(attr, CodeSize(attr)); return make_unique<EmbSeqPoolJitCode>(attr, CodeSize(attr));
} }
}; };
......
...@@ -29,7 +29,11 @@ void MatMulJitCode::genCode() { ...@@ -29,7 +29,11 @@ void MatMulJitCode::genCode() {
preCode(); preCode();
int block, rest; int block, rest;
const auto groups = packed_groups(n_, k_, &block, &rest); const auto groups = packed_groups(n_, k_, &block, &rest);
PADDLE_ENFORCE_GT(groups.front(), 0); PADDLE_ENFORCE_GT(
groups.front(), 0,
platform::errors::InvalidArgument("The number of rest registers should "
"be larger than 0. But it is %d.",
groups.front()));
const int block_len = sizeof(float) * block; const int block_len = sizeof(float) * block;
const int x_reg_idx = (block == ZMM_FLOAT_BLOCK ? 32 : 16) - 1; const int x_reg_idx = (block == ZMM_FLOAT_BLOCK ? 32 : 16) - 1;
...@@ -118,9 +122,21 @@ class MatMulCreator : public JitCodeCreator<matmul_attr_t> { ...@@ -118,9 +122,21 @@ class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
} }
std::unique_ptr<GenBase> CreateJitCode( std::unique_ptr<GenBase> CreateJitCode(
const matmul_attr_t& attr) const override { const matmul_attr_t& attr) const override {
PADDLE_ENFORCE_GT(attr.m, 0); PADDLE_ENFORCE_GT(
PADDLE_ENFORCE_GT(attr.n, 0); attr.m, 0, platform::errors::InvalidArgument(
PADDLE_ENFORCE_GT(attr.k, 0); "The attribute m (first matrix's row) of MatMul should "
"be larger than 0. But it is %d.",
attr.m));
PADDLE_ENFORCE_GT(
attr.n, 0, platform::errors::InvalidArgument(
"The attribute n (first matrix's col) of MatMul should "
"be larger than 0. But it is %d.",
attr.n));
PADDLE_ENFORCE_GT(
attr.k, 0, platform::errors::InvalidArgument(
"The attribute k (second matrix's col) of MatMul should "
"be larger than 0. But it is %d.",
attr.k));
return make_unique<MatMulJitCode>(attr, CodeSize(attr)); return make_unique<MatMulJitCode>(attr, CodeSize(attr));
} }
}; };
......
...@@ -33,7 +33,10 @@ class MatMulJitCode : public JitCode { ...@@ -33,7 +33,10 @@ class MatMulJitCode : public JitCode {
size_t code_size = 256 * 1024, size_t code_size = 256 * 1024,
void* code_ptr = nullptr) void* code_ptr = nullptr)
: JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) { : JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) {
PADDLE_ENFORCE_EQ(m_, 1, "Only support m==1 yet"); PADDLE_ENFORCE_EQ(m_, 1, platform::errors::Unimplemented(
"Jitcode of matmul only support m==1 (first "
"matrix's row) now. But m is %d.",
m_));
this->genCode(); this->genCode();
} }
......
...@@ -70,8 +70,14 @@ class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> { ...@@ -70,8 +70,14 @@ class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
} }
std::unique_ptr<GenBase> CreateJitCode( std::unique_ptr<GenBase> CreateJitCode(
const seq_pool_attr_t& attr) const override { const seq_pool_attr_t& attr) const override {
PADDLE_ENFORCE_GT(attr.w, 0); PADDLE_ENFORCE_GT(attr.w, 0, platform::errors::InvalidArgument(
PADDLE_ENFORCE_GT(attr.h, 0); "The attribute width of SeqPool should "
"be larger than 0. But it is %d.",
attr.w));
PADDLE_ENFORCE_GT(attr.h, 0, platform::errors::InvalidArgument(
"The attribute height of SeqPool should "
"be larger than 0. But it is %d.",
attr.h));
return make_unique<SeqPoolJitCode>(attr, CodeSize(attr)); return make_unique<SeqPoolJitCode>(attr, CodeSize(attr));
} }
}; };
......
...@@ -127,8 +127,13 @@ class SeqPoolJitCode : public JitCode { ...@@ -127,8 +127,13 @@ class SeqPoolJitCode : public JitCode {
vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]); vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
reg_idx++; reg_idx++;
} }
PADDLE_ENFORCE_EQ(reg_idx, rest_used_num_regs, PADDLE_ENFORCE_EQ(
"All heights should use same regs"); reg_idx, rest_used_num_regs,
platform::errors::InvalidArgument(
"All heights of SeqPool should use the same number of registers."
"It equals to the numbr of rest registers. But use %d registers "
"and the numbr of rest registers is %d.",
reg_idx, rest_used_num_regs));
for (int i = 0; i < reg_idx; ++i) { for (int i = 0; i < reg_idx; ++i) {
vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs)); vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs));
} }
......
...@@ -116,9 +116,24 @@ class SgdCreator : public JitCodeCreator<sgd_attr_t> { ...@@ -116,9 +116,24 @@ class SgdCreator : public JitCodeCreator<sgd_attr_t> {
size_t CodeSize(const sgd_attr_t& attr) const override { return 96 + 32 * 8; } size_t CodeSize(const sgd_attr_t& attr) const override { return 96 + 32 * 8; }
std::unique_ptr<GenBase> CreateJitCode( std::unique_ptr<GenBase> CreateJitCode(
const sgd_attr_t& attr) const override { const sgd_attr_t& attr) const override {
PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width); PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width,
PADDLE_ENFORCE_LE(attr.selected_rows_size, attr.grad_height); platform::errors::InvalidArgument(
PADDLE_ENFORCE_GE(attr.selected_rows_size, 0); "The attribute param_width of Sgd should be "
"equal to the attribute grad_width. But param_width "
"is %d and grad_width is %d.",
attr.param_width, attr.grad_width));
PADDLE_ENFORCE_LE(attr.selected_rows_size, attr.grad_height,
platform::errors::InvalidArgument(
"The attribute selected_rows_size of Sgd should be "
"equal to or less than the attribute grad_height. "
"But selected_rows_size is %d and grad_height is %d.",
attr.selected_rows_size, attr.grad_height));
PADDLE_ENFORCE_GE(
attr.selected_rows_size, 0,
platform::errors::InvalidArgument(
"The attribute selected_rows_size of Sgd should be "
"equal to or larger than 0. But selected_rows_size is %d.",
attr.selected_rows_size));
return make_unique<SgdJitCode>(attr, CodeSize(attr)); return make_unique<SgdJitCode>(attr, CodeSize(attr));
} }
}; };
......
...@@ -76,7 +76,11 @@ class VBroadcastCreator : public JitCodeCreator<int64_t> { ...@@ -76,7 +76,11 @@ class VBroadcastCreator : public JitCodeCreator<int64_t> {
return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8; return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8;
} }
std::unique_ptr<GenBase> CreateJitCode(const int64_t& w) const override { std::unique_ptr<GenBase> CreateJitCode(const int64_t& w) const override {
PADDLE_ENFORCE_GT(w, 0); PADDLE_ENFORCE_GT(
w, 0,
platform::errors::InvalidArgument(
"The width of VBroadcast should be larger than 0. But w is %d.",
w));
return make_unique<VBroadcastJitCode>(w, CodeSize(w)); return make_unique<VBroadcastJitCode>(w, CodeSize(w));
} }
}; };
......
...@@ -49,9 +49,14 @@ void GenBase::dumpCode(const unsigned char* code) const { ...@@ -49,9 +49,14 @@ void GenBase::dumpCode(const unsigned char* code) const {
void* GenBase::operator new(size_t size) { void* GenBase::operator new(size_t size) {
void* ptr; void* ptr;
constexpr size_t alignment = 32ul; constexpr size_t alignment = 32ul;
PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size), 0, PADDLE_ENFORCE_EQ(
"GenBase Alloc %ld error!", size); posix_memalign(&ptr, alignment, size), 0,
PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size); platform::errors::InvalidArgument(
"Jitcode generator (GenBase) allocate %ld memory error!", size));
PADDLE_ENFORCE_NOT_NULL(ptr, platform::errors::InvalidArgument(
"Fail to allocate jitcode generator "
"(GenBase) CPU memory: size = %d .",
size));
return ptr; return ptr;
} }
......
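The class-level operator new above exists so generated JIT code buffers start on a 32-byte boundary, which keeps AVX-width accesses over them aligned. A minimal sketch of the same pattern, assuming a POSIX platform (posix_memalign is not available on Windows):

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <new>

// Same pattern as GenBase::operator new above: back every heap instance
// with 32-byte-aligned storage.
struct Aligned32 {
  void* operator new(std::size_t size) {
    void* ptr = nullptr;
    if (posix_memalign(&ptr, 32ul, size) != 0) throw std::bad_alloc();
    return ptr;
  }
  void operator delete(void* ptr) { std::free(ptr); }
  char payload[100];
};

int main() {
  Aligned32* p = new Aligned32();
  std::printf("32-byte aligned: %s\n",
              reinterpret_cast<std::uintptr_t>(p) % 32 == 0 ? "yes" : "no");
  delete p;
  return 0;
}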
...@@ -66,7 +66,8 @@ const char* to_string(KernelType kt) { ...@@ -66,7 +66,8 @@ const char* to_string(KernelType kt) {
ONE_CASE(kEmbSeqPool); ONE_CASE(kEmbSeqPool);
ONE_CASE(kSgd); ONE_CASE(kSgd);
default: default:
PADDLE_THROW("Not support type: %d, or forget to add it.", kt); PADDLE_THROW(platform::errors::Unimplemented(
"JIT kernel do not support type: %d.", kt));
return "NOT JITKernel"; return "NOT JITKernel";
} }
return nullptr; return nullptr;
...@@ -79,7 +80,8 @@ const char* to_string(SeqPoolType tp) { ...@@ -79,7 +80,8 @@ const char* to_string(SeqPoolType tp) {
ONE_CASE(kAvg); ONE_CASE(kAvg);
ONE_CASE(kSqrt); ONE_CASE(kSqrt);
default: default:
PADDLE_THROW("Not support type: %d, or forget to add it.", tp); PADDLE_THROW(platform::errors::Unimplemented(
"SeqPool JIT kernel do not support type: %d.", tp));
return "NOT PoolType"; return "NOT PoolType";
} }
return nullptr; return nullptr;
...@@ -100,7 +102,8 @@ KernelType to_kerneltype(const std::string& act) { ...@@ -100,7 +102,8 @@ KernelType to_kerneltype(const std::string& act) {
} else if (lower == "tanh" || lower == "vtanh") { } else if (lower == "tanh" || lower == "vtanh") {
return kVTanh; return kVTanh;
} }
PADDLE_THROW("Not support type: %s, or forget to add this case", act); PADDLE_THROW(platform::errors::Unimplemented(
"Act JIT kernel do not support type: %s.", act));
return kNone; return kNone;
} }
...@@ -109,12 +112,19 @@ void pack_weights<float>(const float* src, float* dst, int n, int k) { ...@@ -109,12 +112,19 @@ void pack_weights<float>(const float* src, float* dst, int n, int k) {
int block, rest; int block, rest;
const auto groups = packed_groups(n, k, &block, &rest); const auto groups = packed_groups(n, k, &block, &rest);
std::for_each(groups.begin(), groups.end(), [&](int i) { std::for_each(groups.begin(), groups.end(), [&](int i) {
PADDLE_ENFORCE_GT(i, 0, "each element of groups should be larger than 0."); PADDLE_ENFORCE_GT(i, 0, platform::errors::InvalidArgument(
"Each element of groups should be larger than "
"0. However the element: %d doesn't satify.",
i));
}); });
int sum = std::accumulate(groups.begin(), groups.end(), 0); int sum = std::accumulate(groups.begin(), groups.end(), 0);
std::memset(dst, 0, k * sum * block * sizeof(float)); std::memset(dst, 0, k * sum * block * sizeof(float));
PADDLE_ENFORCE_GE(sum * block, n, PADDLE_ENFORCE_GE(sum * block, n,
"The packed n should be equal to or larger than n"); platform::errors::InvalidArgument(
"The packed n (sum * block) should be equal to or "
"larger than n (matmul row size). "
"However, the packed n is %d and n is %d.",
sum * block, n));
const int block_len = sizeof(float) * block; const int block_len = sizeof(float) * block;
int n_offset = 0; int n_offset = 0;
...@@ -136,7 +146,8 @@ void pack_weights<float>(const float* src, float* dst, int n, int k) { ...@@ -136,7 +146,8 @@ void pack_weights<float>(const float* src, float* dst, int n, int k) {
template <typename T> template <typename T>
typename std::enable_if<!std::is_same<T, float>::value>::type pack_weights( typename std::enable_if<!std::is_same<T, float>::value>::type pack_weights(
const T* src, T* dst, int n, int k) { const T* src, T* dst, int n, int k) {
PADDLE_THROW("Only support pack with float type."); PADDLE_THROW(platform::errors::Unimplemented(
"Only supports pack weights with float type."));
} }
} // namespace jit } // namespace jit
......
...@@ -85,8 +85,10 @@ inline const Kernel* GetReferKernel() { ...@@ -85,8 +85,10 @@ inline const Kernel* GetReferKernel() {
auto& ref_pool = ReferKernelPool::Instance().AllKernels(); auto& ref_pool = ReferKernelPool::Instance().AllKernels();
KernelKey kkey(KernelTuple::kernel_type, platform::CPUPlace()); KernelKey kkey(KernelTuple::kernel_type, platform::CPUPlace());
auto ref_iter = ref_pool.find(kkey); auto ref_iter = ref_pool.find(kkey);
PADDLE_ENFORCE(ref_iter != ref_pool.end(), PADDLE_ENFORCE_NE(
"Every Kernel should have reference function."); ref_iter, ref_pool.end(),
platform::errors::PreconditionNotMet(
"Every Refer Kernel of jitcode should have reference function."));
auto& ref_impls = ref_iter->second; auto& ref_impls = ref_iter->second;
for (auto& impl : ref_impls) { for (auto& impl : ref_impls) {
auto i = dynamic_cast<const ReferKernel<KernelTuple>*>(impl.get()); auto i = dynamic_cast<const ReferKernel<KernelTuple>*>(impl.get());
...@@ -101,7 +103,9 @@ template <typename KernelTuple> ...@@ -101,7 +103,9 @@ template <typename KernelTuple>
inline typename KernelTuple::func_type GetReferFunc() { inline typename KernelTuple::func_type GetReferFunc() {
auto ker = GetReferKernel<KernelTuple>(); auto ker = GetReferKernel<KernelTuple>();
auto p = dynamic_cast<const ReferKernel<KernelTuple>*>(ker); auto p = dynamic_cast<const ReferKernel<KernelTuple>*>(ker);
PADDLE_ENFORCE(p, "The Refer kernel should exsit"); PADDLE_ENFORCE_NOT_NULL(p, platform::errors::InvalidArgument(
"Get the reference code of kernel in CPU "
"failed. The Refer kernel should exsit."));
return p->GetFunc(); return p->GetFunc();
} }
...@@ -132,7 +136,9 @@ std::vector<const Kernel*> GetAllCandidateKernels( ...@@ -132,7 +136,9 @@ std::vector<const Kernel*> GetAllCandidateKernels(
// The last implementation should be reference function on CPUPlace. // The last implementation should be reference function on CPUPlace.
auto ref = GetReferKernel<KernelTuple>(); auto ref = GetReferKernel<KernelTuple>();
PADDLE_ENFORCE(ref != nullptr, "Refer Kernel can not be empty."); PADDLE_ENFORCE_NOT_NULL(ref, platform::errors::InvalidArgument(
"Get all candicate kernel in CPU failed. "
"The Refer Kernel can not be empty."));
res.emplace_back(ref); res.emplace_back(ref);
return res; return res;
} }
...@@ -147,11 +153,14 @@ GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) { ...@@ -147,11 +153,14 @@ GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) {
std::string name = k->ImplType(); std::string name = k->ImplType();
if (name == "JitCode") { if (name == "JitCode") {
auto i = dynamic_cast<const GenBase*>(k); auto i = dynamic_cast<const GenBase*>(k);
PADDLE_ENFORCE(i, "jitcode kernel cast can not fail."); PADDLE_ENFORCE_NOT_NULL(i,
platform::errors::InvalidArgument(
"Generate jitcode kernel (GenBase) failed."));
res.emplace_back(std::make_pair(name, i->template getCode<Func>())); res.emplace_back(std::make_pair(name, i->template getCode<Func>()));
} else { } else {
auto i = dynamic_cast<const KernelMore<KernelTuple>*>(k); auto i = dynamic_cast<const KernelMore<KernelTuple>*>(k);
PADDLE_ENFORCE(i, "kernel cast can not fail."); PADDLE_ENFORCE_NOT_NULL(i, platform::errors::InvalidArgument(
"Kernel cast (KernelMore) failed."));
res.emplace_back(std::make_pair(name, i->GetFunc())); res.emplace_back(std::make_pair(name, i->GetFunc()));
} }
} }
...@@ -173,7 +182,9 @@ template <typename KernelTuple, typename PlaceType = platform::CPUPlace> ...@@ -173,7 +182,9 @@ template <typename KernelTuple, typename PlaceType = platform::CPUPlace>
typename KernelTuple::func_type GetDefaultBestFunc( typename KernelTuple::func_type GetDefaultBestFunc(
const typename KernelTuple::attr_type& attr) { const typename KernelTuple::attr_type& attr) {
auto funcs = GetAllCandidateFuncs<KernelTuple, PlaceType>(attr); auto funcs = GetAllCandidateFuncs<KernelTuple, PlaceType>(attr);
PADDLE_ENFORCE_GE(funcs.size(), 1UL); PADDLE_ENFORCE_GE(funcs.size(), 1UL,
platform::errors::InvalidArgument(
"The candicate jit kernel is at least one in CPU."));
// Here could do some runtime benchmark of this attr and return the best one. // Here could do some runtime benchmark of this attr and return the best one.
// But yet just get the first one as the default best one, // But yet just get the first one as the default best one,
// which is searched in order and tuned by offline. // which is searched in order and tuned by offline.
......
...@@ -95,7 +95,8 @@ void (*getActFunc(KernelType type, int d))(const T*, T*, int) { // NOLINT ...@@ -95,7 +95,8 @@ void (*getActFunc(KernelType type, int d))(const T*, T*, int) { // NOLINT
} else if (type == kVIdentity) { } else if (type == kVIdentity) {
return KernelFuncs<VIdentityTuple<T>, CPUPlace>::Cache().At(d); return KernelFuncs<VIdentityTuple<T>, CPUPlace>::Cache().At(d);
} }
PADDLE_THROW("Not support type: %s", type); PADDLE_THROW(platform::errors::Unimplemented(
"Act JIT kernel do not support type: %s", type));
return nullptr; return nullptr;
} }
......
...@@ -103,11 +103,24 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) { ...@@ -103,11 +103,24 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
template <typename T> template <typename T>
void EmbSeqPool(const T* table, const int64_t* idx, T* out, void EmbSeqPool(const T* table, const int64_t* idx, T* out,
const emb_seq_pool_attr_t* attr) { const emb_seq_pool_attr_t* attr) {
PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width); PADDLE_ENFORCE_EQ(
attr->table_width * attr->index_width, attr->out_width,
platform::errors::InvalidArgument(
"The attribute table_width * index_width of EmbSeqPool should "
"be equal to out_width. But table_width * index_width is %d, "
"out_width is %d.",
attr->table_width * attr->index_width, attr->out_width));
auto check_idx_value_valid = [&](int64_t i) { auto check_idx_value_valid = [&](int64_t i) {
PADDLE_ENFORCE_LT(idx[i], attr->table_height, "idx value: %d, i: %d", PADDLE_ENFORCE_LT(
idx[i], i); idx[i], attr->table_height,
PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i); platform::errors::InvalidArgument(
"The idx shoud be lower than the attribute table_height of "
"EmbSeqPool. But %dth of idx is %d and table_height is %d.",
i, idx[i], attr->table_height));
PADDLE_ENFORCE_GE(idx[i], 0, platform::errors::InvalidArgument(
"The idx shoud be equal to or larger than "
"the 0. But %dth of idx is %d.",
i, idx[i]));
}; };
for (int64_t w = 0; w != attr->index_width; ++w) { for (int64_t w = 0; w != attr->index_width; ++w) {
...@@ -168,22 +181,50 @@ void Softmax(const T* x, T* y, int n, int bs, int remain = 1) { ...@@ -168,22 +181,50 @@ void Softmax(const T* x, T* y, int n, int bs, int remain = 1) {
template <typename T> template <typename T>
void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows, void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows,
T* out, const sgd_attr_t* attr) { T* out, const sgd_attr_t* attr) {
PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width); PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width,
PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height); platform::errors::InvalidArgument(
"The attribute param_width of Sgd should be "
"equal to the attribute grad_width. But param_width "
"is %d and grad_width is %d.",
attr->param_width, attr->grad_width));
PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height,
platform::errors::InvalidArgument(
"The attribute selected_rows_size of Sgd should be "
"equal to or less than the attribute grad_height. "
"But selected_rows_size is %d and grad_height is %d.",
attr->selected_rows_size, attr->grad_height));
T scalar = -lr[0]; T scalar = -lr[0];
int width = attr->grad_width; int width = attr->grad_width;
if (out == param) { if (out == param) {
for (int64_t i = 0; i < attr->selected_rows_size; ++i) { for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
auto h_idx = rows[i]; auto h_idx = rows[i];
PADDLE_ENFORCE_LT(h_idx, attr->param_height); PADDLE_ENFORCE_LT(h_idx, attr->param_height,
PADDLE_ENFORCE_GE(h_idx, 0); platform::errors::InvalidArgument(
"The rows of Sgd should be "
"less than the attribute. But %dth of rows "
"is %d and grad_width is %d.",
i, h_idx, attr->param_height));
PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument(
"The rows of Sgd should be "
"larger than 0. But %dth of rows "
"is %d.",
i, h_idx));
VAXPY(scalar, grad + i * width, out + h_idx * width, width); VAXPY(scalar, grad + i * width, out + h_idx * width, width);
} }
} else { } else {
for (int64_t i = 0; i < attr->selected_rows_size; ++i) { for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
auto h_idx = rows[i]; auto h_idx = rows[i];
PADDLE_ENFORCE_LT(h_idx, attr->param_height); PADDLE_ENFORCE_LT(h_idx, attr->param_height,
PADDLE_ENFORCE_GE(h_idx, 0); platform::errors::InvalidArgument(
"The rows of Sgd should be "
"less than the attribute. But %dth of rows "
"is %d and grad_width is %d.",
i, h_idx, attr->param_height));
PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument(
"The rows of Sgd should be "
"larger than 0. But %dth of rows "
"is %d.",
i, h_idx));
VScal(&scalar, grad + i * width, out + h_idx * width, width); VScal(&scalar, grad + i * width, out + h_idx * width, width);
VAdd(param + h_idx * width, out + h_idx * width, out + h_idx * width, VAdd(param + h_idx * width, out + h_idx * width, out + h_idx * width,
width); width);
......
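The refer Sgd path being instrumented above implements the classic selected-rows update: grad holds only selected_rows_size compressed rows, and rows[i] names the parameter row each of them touches, which is exactly why every h_idx must stay inside [0, param_height). A minimal sketch of that update:

#include <cstdio>

// Sketch of the selected-rows SGD update the refer kernel implements:
// grad holds two compressed rows; rows[] maps them onto the full parameter.
int main() {
  const float lr = 0.1f;
  float param[4 * 2] = {1, 1, 1, 1, 1, 1, 1, 1};  // param_height=4, width=2
  const float grad[2 * 2] = {10, 10, 20, 20};     // selected_rows_size=2
  const long rows[2] = {0, 3};  // each grad row's target parameter row
  for (int i = 0; i < 2; ++i) {
    for (int j = 0; j < 2; ++j) {
      param[rows[i] * 2 + j] -= lr * grad[i * 2 + j];
    }
  }
  // Rows 0 and 3 moved; rows 1 and 2 are untouched.
  std::printf("%g %g %g %g\n", param[0], param[2], param[4], param[6]);  // 0 1 1 -1
  return 0;
}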
...@@ -147,7 +147,8 @@ void (*getActFunc(KernelType type))(const T*, T*, int) { // NOLINT ...@@ -147,7 +147,8 @@ void (*getActFunc(KernelType type))(const T*, T*, int) { // NOLINT
} else if (type == kVIdentity) { } else if (type == kVIdentity) {
return VIdentity<T>; return VIdentity<T>;
} }
PADDLE_THROW("Not support type: %s", type); PADDLE_THROW(platform::errors::Unimplemented(
"Act JIT kernel do not support type: %s.", type));
return nullptr; return nullptr;
} }
...@@ -465,12 +466,25 @@ void Softmax(const T* x, T* y, int n, int bs = 1, int remain = 1) { ...@@ -465,12 +466,25 @@ void Softmax(const T* x, T* y, int n, int bs = 1, int remain = 1) {
template <typename T> template <typename T>
void EmbSeqPool(const T* table, const int64_t* idx, T* out, void EmbSeqPool(const T* table, const int64_t* idx, T* out,
const emb_seq_pool_attr_t* attr) { const emb_seq_pool_attr_t* attr) {
PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width); PADDLE_ENFORCE_EQ(
attr->table_width * attr->index_width, attr->out_width,
platform::errors::InvalidArgument(
"The attribute table_width * index_width of EmbSeqPool should "
"be equal to out_width. But table_width * index_width is %d and "
"out_width is %d.",
attr->table_width * attr->index_width, attr->out_width));
auto check_idx_value_valid = [&](int64_t i) { auto check_idx_value_valid = [&](int64_t i) {
PADDLE_ENFORCE_LT(idx[i], attr->table_height, "idx value: %d, i: %d", PADDLE_ENFORCE_LT(
idx[i], i); idx[i], attr->table_height,
PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i); platform::errors::InvalidArgument(
"The idx shoud be lower than the attribute table_height of "
"EmbSeqPool. But %dth of idx is %d and table_height is %d.",
i, idx[i], attr->table_height));
PADDLE_ENFORCE_GE(idx[i], 0, platform::errors::InvalidArgument(
"The idx shoud be equal to or larger than "
"the 0. But %dth of idx is %d.",
i, idx[i]));
}; };
for (int64_t w = 0; w != attr->index_width; ++w) { for (int64_t w = 0; w != attr->index_width; ++w) {
...@@ -505,12 +519,31 @@ void EmbSeqPool(const T* table, const int64_t* idx, T* out, ...@@ -505,12 +519,31 @@ void EmbSeqPool(const T* table, const int64_t* idx, T* out,
template <typename T> template <typename T>
void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows, void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows,
T* out, const sgd_attr_t* attr) { T* out, const sgd_attr_t* attr) {
PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width); PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width,
PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height); platform::errors::InvalidArgument(
"The attribute param_width of Sgd should be "
"equal to the attribute grad_width. But param_width "
"is %d and grad_width is %d.",
attr->param_width, attr->grad_width));
PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height,
platform::errors::InvalidArgument(
"The attribute selected_rows_size of Sgd should be "
"equal to or less than the attribute grad_height. "
"But selected_rows_size is %d and grad_height is %d.",
attr->selected_rows_size, attr->grad_height));
for (int64_t i = 0; i < attr->selected_rows_size; ++i) { for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
auto h_idx = rows[i]; auto h_idx = rows[i];
PADDLE_ENFORCE_LT(h_idx, attr->param_height); PADDLE_ENFORCE_LT(h_idx, attr->param_height,
PADDLE_ENFORCE_GE(h_idx, 0); platform::errors::InvalidArgument(
"The rows of Sgd should be "
"less than the attribute. But %dth of rows "
"is %d and grad_width is %d.",
i, h_idx, attr->param_height));
PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument(
"The rows of Sgd should be "
"larger than 0. But %dth of rows "
"is %d.",
i, h_idx));
for (int64_t j = 0; j < attr->grad_width; ++j) { for (int64_t j = 0; j < attr->grad_width; ++j) {
out[h_idx * attr->grad_width + j] = out[h_idx * attr->grad_width + j] =
param[h_idx * attr->grad_width + j] - param[h_idx * attr->grad_width + j] -
......
...@@ -850,8 +850,15 @@ void TestKernelSgd() { ...@@ -850,8 +850,15 @@ void TestKernelSgd() {
const T lr = 0.1; const T lr = 0.1;
auto UnDuplicatedRandomVec = [](int n, const int64_t lower, auto UnDuplicatedRandomVec = [](int n, const int64_t lower,
const int64_t upper) -> std::vector<int64_t> { const int64_t upper) -> std::vector<int64_t> {
PADDLE_ENFORCE_LE(static_cast<size_t>(upper - lower), n - 1); PADDLE_ENFORCE_LE(static_cast<size_t>(upper - lower), n - 1,
PADDLE_ENFORCE_GT(n, 0); paddle::platform::errors::InvalidArgument(
"The range of Sgd (upper - lower) should be lower "
"than n-1 (Sgd size -1). But the upper - lower is %d "
"and n-1 is %d.",
static_cast<size_t>(upper - lower), n - 1));
PADDLE_ENFORCE_GT(
n, 0, paddle::platform::errors::InvalidArgument(
"The Sgd size should be larger than 0. But the n is %d.", n));
std::vector<int64_t> all, out; std::vector<int64_t> all, out;
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
all.push_back(i); all.push_back(i);
......
...@@ -420,6 +420,22 @@ void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N, ...@@ -420,6 +420,22 @@ void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
}); });
} }
template <>
template <>
inline void Blas<platform::CUDADeviceContext>::GEMV(
bool trans_a, int M, int N, platform::float16 alpha,
const platform::float16 *A, const platform::float16 *B,
platform::float16 beta, platform::float16 *C) const {
// Because cublas doesn't support half gemv, we use cublasHgemm to achieve it.
if (trans_a) {
this->template GEMM<platform::float16>(CblasNoTrans, CblasNoTrans, 1, N, M,
alpha, B, A, beta, C);
} else {
this->template GEMM<platform::float16>(CblasNoTrans, CblasNoTrans, M, 1, N,
alpha, A, B, beta, C);
}
}
template <> template <>
template <typename T> template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGEMM( void Blas<platform::CUDADeviceContext>::BatchedGEMM(
...@@ -479,6 +495,19 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM( ...@@ -479,6 +495,19 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
} }
} }
template <>
template <>
inline void Blas<platform::CUDADeviceContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
platform::float16 alpha, const platform::float16 **A,
const platform::float16 **B, platform::float16 beta, platform::float16 **C,
int batchCount) const {
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<platform::float16>(transA, transB, M, N, K, alpha, A[k],
B[k], beta, C[k]);
}
}
template <> template <>
template <typename T> template <typename T>
void Blas<platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, void Blas<platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
......
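The float16 GEMV specialization above leans on a purely dimensional identity: A*x is an (M x N)(N x 1) GEMM and A^T*x is a (1 x M)(M x N) GEMM, which is why cublasHgemm can stand in for the missing Hgemv; the BatchedGEMM specialization likewise just loops GEMM over the batch. A small float reference check of the GEMV mapping (plain C++, no cuBLAS; Gemm below is a naive row-major helper written for this sketch):

#include <cstdio>
#include <vector>

// Naive row-major GEMM written for this sketch: C(M x N) = A(M x K) * B(K x N).
static void Gemm(int M, int N, int K, const float* A, const float* B, float* C) {
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      float s = 0.f;
      for (int k = 0; k < K; ++k) s += A[i * K + k] * B[k * N + j];
      C[i * N + j] = s;
    }
  }
}

int main() {
  const int M = 2, N = 3;
  std::vector<float> A = {1, 2, 3, 4, 5, 6};  // 2x3 matrix
  std::vector<float> x = {1, 1, 1};           // length-N vector
  std::vector<float> y(M);
  // GEMV(no-trans): y = A * x, expressed as GEMM dims (M, 1, N).
  Gemm(M, 1, N, A.data(), x.data(), y.data());
  std::printf("A*x   = %g %g (expect 6 15)\n", y[0], y[1]);
  std::vector<float> x2 = {1, 1};  // length-M vector
  std::vector<float> y2(N);
  // GEMV(trans): y = A^T * x2, expressed as GEMM dims (1, N, M).
  Gemm(1, N, M, x2.data(), A.data(), y2.data());
  std::printf("A^T*x = %g %g %g (expect 5 7 9)\n", y2[0], y2[1], y2[2]);
  return 0;
}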
...@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_div_op.h" #include <algorithm>
#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/gather.cu.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/segment_pooling.h" #include "paddle/fluid/operators/math/segment_pooling.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/gpu_launch_param_config.h" #include "paddle/fluid/platform/gpu_launch_param_config.h"
#include "paddle/fluid/platform/macros.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -100,7 +99,7 @@ __global__ void SegmentOpsKernel(const Index* segment_ids, const T* input, ...@@ -100,7 +99,7 @@ __global__ void SegmentOpsKernel(const Index* segment_ids, const T* input,
CUDA_KERNEL_LOOP(stripe_index, h.total_stripe_count) { CUDA_KERNEL_LOOP(stripe_index, h.total_stripe_count) {
Index segment_offset, dim_index_base, actual_height; Index segment_offset, dim_index_base, actual_height;
Index inner_dim_size = h.inner_dim_size; Index inner_dim_size = h.inner_dim_size;
h.calculate(stripe_index, segment_offset, dim_index_base, actual_height); h.calculate(stripe_index, &segment_offset, &dim_index_base, &actual_height);
T minmax = pool.initial(); T minmax = pool.initial();
Index first_segment_id = segment_ids[dim_index_base]; Index first_segment_id = segment_ids[dim_index_base];
...@@ -154,7 +153,7 @@ __global__ void SegmentIndexGradKernel(const Index* segment_ids, const T* input, ...@@ -154,7 +153,7 @@ __global__ void SegmentIndexGradKernel(const Index* segment_ids, const T* input,
T* in_grad, Helper h) { T* in_grad, Helper h) {
CUDA_KERNEL_LOOP(stripe_index, h.total_stripe_count) { CUDA_KERNEL_LOOP(stripe_index, h.total_stripe_count) {
Index segment_offset, dim_index_base, actual_height; Index segment_offset, dim_index_base, actual_height;
h.calculate(stripe_index, segment_offset, dim_index_base, actual_height); h.calculate(stripe_index, &segment_offset, &dim_index_base, &actual_height);
for (Index j = 0; j < actual_height; j++) { for (Index j = 0; j < actual_height; j++) {
Index current_segment_id = segment_ids[dim_index_base + j]; Index current_segment_id = segment_ids[dim_index_base + j];
...@@ -217,11 +216,11 @@ class ArrangeHelper { ...@@ -217,11 +216,11 @@ class ArrangeHelper {
total_stripe_count = inner_dim_size * input_outer_dim_num_stripe; total_stripe_count = inner_dim_size * input_outer_dim_num_stripe;
} }
DEVICE inline void calculate(T stripe_index, T& segment_offset, DEVICE inline void calculate(T stripe_index, T* segment_offset,
T& dim_index_base, T& actual_height) { T* dim_index_base, T* actual_height) {
segment_offset = stripe_index % inner_dim_size; *segment_offset = stripe_index % inner_dim_size;
dim_index_base = stripe_index / inner_dim_size * DimTileSize; *dim_index_base = stripe_index / inner_dim_size * DimTileSize;
actual_height = min(DimTileSize, input_length_size - dim_index_base); *actual_height = min(DimTileSize, input_length_size - *dim_index_base);
} }
}; };
......
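The pointer-based calculate above packs three derived indices out of one flat stripe id: which inner column the stripe serves, which row tile it starts at, and how many rows of that tile actually exist. A host-side sketch of the same arithmetic (kDimTileSize = 8 is an illustrative assumption, not the kernel's real configuration):

#include <algorithm>
#include <cstdio>

// Host-side mirror of ArrangeHelper::calculate above.
constexpr int kDimTileSize = 8;

static void Calculate(int stripe_index, int inner_dim_size,
                      int input_length_size, int* segment_offset,
                      int* dim_index_base, int* actual_height) {
  *segment_offset = stripe_index % inner_dim_size;                 // inner column
  *dim_index_base = stripe_index / inner_dim_size * kDimTileSize;  // row tile start
  *actual_height =
      std::min(kDimTileSize, input_length_size - *dim_index_base);  // clamp at end
}

int main() {
  int offset = 0, base = 0, height = 0;
  // 4 inner columns, 20 input rows: the last row tile holds only 4 rows.
  Calculate(/*stripe_index=*/9, /*inner_dim_size=*/4,
            /*input_length_size=*/20, &offset, &base, &height);
  std::printf("offset=%d base=%d height=%d\n", offset, base, height);  // 1 16 4
  return 0;
}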
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include <algorithm>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
namespace operators {
static framework::DDim RowMatrixFromVector(const framework::DDim &x_dim) {
if (x_dim.size() > 1) {
return x_dim;
}
return framework::make_ddim({1, x_dim[0]});
}
static framework::Tensor FoldInitDims(const framework::Tensor &input) {
auto output = input;
auto in_dims = input.dims();
if (in_dims.size() == 3) {
output.Resize({in_dims[0] * in_dims[1], in_dims[2]});
}
return output;
}
/**
* Get column matrix shape from a vector shape. If the rank of y_dim > 1, the
* original y_dim is returned.
*/
static framework::DDim ColumnMatrixFromVector(const framework::DDim &y_dim) {
if (y_dim.size() > 1) {
return y_dim;
}
return framework::make_ddim({y_dim[0], 1});
}
static void ReshapeTensorIntoMatrixSequence(
framework::Tensor *x, const math::MatDescriptor &descriptor) {
int64_t h, w;
h = descriptor.height_;
w = descriptor.width_;
if (descriptor.trans_) {
std::swap(w, h);
}
if (descriptor.batch_size_) {
x->Resize({descriptor.batch_size_, h, w});
} else {
x->Resize({h, w});
}
}
/**
* Reshape the x,y,out tensor to 3-D or 2-D tensor by matrix descriptor
* Out = matmul(x, y)
*
* This method will first calculate X,Y matrix sequence, and then calculate
* the out shape.
*
* Assume X = [BatchSize, H1, W1], Y = [BatchSize, H2, W2]
* The out = [BatchSize, H1, W2]
*
* If there is no batch size in `X` and `Y`, the out will be [H1, W2]
* If any of `X` and `Y` has batch size BatchSize, the out will have the
* BatchSize.
*/
static void ReshapeXYOutIntoMatrixSequence(framework::Tensor *x,
framework::Tensor *y,
framework::Tensor *out, bool trans_x,
bool trans_y) {
auto x_dim = RowMatrixFromVector(x->dims());
auto y_dim = ColumnMatrixFromVector(y->dims());
auto mat_dim_x = math::CreateMatrixDescriptor(x_dim, 0, trans_x);
auto mat_dim_y = math::CreateMatrixDescriptor(y_dim, 0, trans_y);
if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) {
out->Resize({mat_dim_x.height_, mat_dim_y.width_});
} else {
out->Resize({std::max(mat_dim_x.batch_size_, mat_dim_y.batch_size_),
mat_dim_x.height_, mat_dim_y.width_});
}
ReshapeTensorIntoMatrixSequence(x, mat_dim_x);
ReshapeTensorIntoMatrixSequence(y, mat_dim_y);
}
template <typename DeviceContext, typename T>
class MatMulXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *x = context.Input<framework::Tensor>("X");
auto *y = context.Input<framework::Tensor>("Y");
auto *out = context.Output<framework::Tensor>("Out");
out->mutable_data<T>(context.GetPlace());
auto mat_dim_a = math::CreateMatrixDescriptor(
RowMatrixFromVector(x->dims()), 0, context.Attr<bool>("transpose_X"));
auto mat_dim_b =
math::CreateMatrixDescriptor(ColumnMatrixFromVector(y->dims()), 0,
context.Attr<bool>("transpose_Y"));
PADDLE_ENFORCE_EQ(
mat_dim_a.width_, mat_dim_b.height_,
platform::errors::InvalidArgument("Shape mistake in matmul_op"));
PADDLE_ENFORCE_EQ(
mat_dim_a.batch_size_, mat_dim_b.batch_size_,
platform::errors::InvalidArgument("Shape mistake in matmul_op"));
T alpha = static_cast<T>(context.Attr<float>("alpha"));
auto &dev_ctx = context.template device_context<DeviceContext>();
float *data_c = out->data<T>();
if (mat_dim_a.batch_size_ == 0 || mat_dim_a.batch_size_ == 1) {
int r =
xpu::fc_int16(dev_ctx.x_context(), mat_dim_a.trans_, mat_dim_b.trans_,
mat_dim_a.height_, mat_dim_b.width_, mat_dim_a.width_,
alpha, x->data<T>(), y->data<T>(), 0.0f, data_c);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
} else {
// batch matmul
int r = xpu::batched_gemm_int16(dev_ctx.x_context(), mat_dim_a.trans_,
mat_dim_b.trans_, mat_dim_a.batch_size_,
mat_dim_a.height_, mat_dim_b.width_,
mat_dim_a.width_, alpha, x->data<T>(),
y->data<T>(), data_c, nullptr, nullptr);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
}
}
};
// Reshape a rank-3 tensor from P x M x N to M x (P * N).
// (Warning: This requires transposing data and writes into new memory.)
// Identity op if the tensor is not of rank 3.
template <typename DeviceContext, typename T>
static framework::Tensor XPUFoldHeadAndLastDims(
const DeviceContext &context, const framework::Tensor &input) {
auto in_dims = input.dims();
if (in_dims.size() != 3) {
return input;
}
framework::Tensor output;
output.Resize({in_dims[1], in_dims[0], in_dims[2]});
output.mutable_data<T>(context.GetPlace());
std::vector<int> in_shape_host = {static_cast<int>(in_dims[0]),
static_cast<int>(in_dims[1]),
static_cast<int>(in_dims[2])};
std::vector<int> axis_host = {1, 0, 2};
int r = xpu::transpose(context.x_context(), input.data<T>(), output.data<T>(),
in_shape_host.data(), axis_host.data(), /*ndims=*/3);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
output.Resize({in_dims[1], in_dims[0] * in_dims[2]});
return output;
}
// Using dimensional constraints on matrix multiplication, it is
// straight-forward to check the following table for when X and Y
// are both matrices.
//
// transpose_X | False | True | False | True
// transpose_Y | False | False | True | True
// -----------+----------+----------+----------+-----------
// dX = | dOut Y^T | Y dOut^T | dOut Y | Y^T dOut^T
// dY = | X^T dOut | X dOut | dOut^T X | dOut^T X^T
//
// When X is a vector of size K, we treat it instead as a matrix of shape
// (1, K). Similarly, when Y is a vector of size K, we treat it instead as
// a matrix of shape (K, 1).
//
// When X and Y are both 3-dimensional tensors, then the first dimension,
// the batch dimension, can be ignored and the exact same formulas apply
// as for two matrices.
//
// Finally, when, e.g., X is a 3-dimensional tensor but Y is a matrix, we end
// up with formulas like
//
// dY_{ij} = \sum_{p, m} X_{pmi} dOut_{pmj}
//
// To handle this sort of scenario, we reshape X : P x M x K, dOut: P x M x N
// to X: (P * M) x K, dOut: (P * M) x N.
template <typename DeviceContext, typename T>
class MatMulGradXPUKernel : public framework::OpKernel<T> {
public:
void MatMul(const framework::ExecutionContext &context,
const framework::Tensor &a, bool trans_a,
const framework::Tensor &b, bool trans_b,
framework::Tensor *out) const {
out->mutable_data<T>(context.GetPlace());
auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a);
auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b);
PADDLE_ENFORCE_EQ(
mat_dim_a.width_, mat_dim_b.height_,
platform::errors::InvalidArgument("Shape mistake in matmul_grad_op"));
PADDLE_ENFORCE_EQ(
mat_dim_a.batch_size_, mat_dim_b.batch_size_,
platform::errors::InvalidArgument("Shape mistake in matmul_grad_op"));
T alpha = static_cast<T>(context.Attr<float>("alpha"));
auto &dev_ctx = context.template device_context<DeviceContext>();
float *data_c = out->data<T>();
if (mat_dim_a.batch_size_ == 0 || mat_dim_a.batch_size_ == 1) {
int r =
xpu::fc_int16(dev_ctx.x_context(), mat_dim_a.trans_, mat_dim_b.trans_,
mat_dim_a.height_, mat_dim_b.width_, mat_dim_a.width_,
alpha, a.data<T>(), b.data<T>(), 0.0f, data_c);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
} else {
// batch matmul
int r = xpu::batched_gemm_int16(dev_ctx.x_context(), mat_dim_a.trans_,
mat_dim_b.trans_, mat_dim_a.batch_size_,
mat_dim_a.height_, mat_dim_b.width_,
mat_dim_a.width_, alpha, a.data<T>(),
b.data<T>(), data_c, nullptr, nullptr);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
}
}
void CalcInputGrad(const framework::ExecutionContext &context,
const framework::Tensor &a, bool trans_a,
bool is_fold_init_dims_a, const framework::Tensor &b,
bool trans_b, bool is_fold_init_dims_b,
framework::Tensor *out) const {
if (out == nullptr) return;
bool need_combine = (a.dims().size() == 3 || b.dims().size() == 3) &&
out->dims().size() == 2;
if (!need_combine) {
MatMul(context, a, trans_a, b, trans_b, out);
} else {
auto &dev_ctx = context.template device_context<DeviceContext>();
MatMul(
context, is_fold_init_dims_a
? FoldInitDims(a)
: XPUFoldHeadAndLastDims<DeviceContext, T>(dev_ctx, a),
trans_a, is_fold_init_dims_b
? FoldInitDims(b)
: XPUFoldHeadAndLastDims<DeviceContext, T>(dev_ctx, b),
trans_b, out);
}
}
void Compute(const framework::ExecutionContext &context) const override {
auto x = *context.Input<framework::Tensor>("X");
auto y = *context.Input<framework::Tensor>("Y");
auto dout =
*context.Input<framework::Tensor>(framework::GradVarName("Out"));
auto *dx = context.Output<framework::Tensor>(framework::GradVarName("X"));
auto *dy = context.Output<framework::Tensor>(framework::GradVarName("Y"));
bool transpose_x = context.Attr<bool>("transpose_X");
bool transpose_y = context.Attr<bool>("transpose_Y");
ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y);
framework::DDim dx_dims;
if (dx) {
dx_dims = dx->dims();
if (dx_dims != x.dims()) {
dx->Resize(x.dims());
}
}
framework::DDim dy_dims;
if (dy) {
dy_dims = dy->dims();
if (dy_dims != y.dims()) {
dy->Resize(y.dims());
}
}
if (transpose_x && transpose_y) {
CalcInputGrad(context, y, true, true, dout, true, false, dx);
CalcInputGrad(context, dout, true, true, x, true, false, dy);
} else if (transpose_x) {
CalcInputGrad(context, y, false, false, dout, true, false, dx);
CalcInputGrad(context, x, false, false, dout, false, true, dy);
} else if (transpose_y) {
CalcInputGrad(context, dout, false, false, y, false, true, dx);
CalcInputGrad(context, dout, true, true, x, false, true, dy);
} else {
CalcInputGrad(context, dout, false, false, y, true, false, dx);
CalcInputGrad(context, x, true, true, dout, false, true, dy);
}
if (dx) {
if (dx_dims != x.dims()) {
dx->Resize(dx_dims);
}
}
if (dy) {
if (dy_dims != y.dims()) {
dy->Resize(dy_dims);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
matmul, ops::MatMulXPUKernel<paddle::platform::XPUDeviceContext, float>);
REGISTER_OP_XPU_KERNEL(
matmul_grad,
ops::MatMulGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
#endif
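The gradient table in the comment above is easy to sanity-check numerically. A small host-side sketch for the no-transpose column (dX = dOut Y^T, dY = X^T dOut), using a naive row-major helper written for this sketch:

#include <cstdio>
#include <vector>

// Row-major matmul with optional transposes, enough to check the
// gradient table: C(m x n) = op(A) * op(B).
static void MatMul(const std::vector<float>& A, int ar, int ac, bool ta,
                   const std::vector<float>& B, int br, int bc, bool tb,
                   std::vector<float>* C) {
  int m = ta ? ac : ar, k = ta ? ar : ac, n = tb ? br : bc;
  C->assign(m * n, 0.f);
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j)
      for (int p = 0; p < k; ++p) {
        float a = ta ? A[p * ac + i] : A[i * ac + p];
        float b = tb ? B[j * bc + p] : B[p * bc + j];
        (*C)[i * n + j] += a * b;
      }
}

int main() {
  // X: 2x3, Y: 3x2, Out = X*Y: 2x2, dOut set to all ones.
  std::vector<float> X = {1, 2, 3, 4, 5, 6}, Y = {1, 0, 0, 1, 1, 1};
  std::vector<float> dOut = {1, 1, 1, 1}, dX, dY;
  MatMul(dOut, 2, 2, false, Y, 3, 2, true, &dX);  // dX = dOut * Y^T : 2x3
  MatMul(X, 2, 3, true, dOut, 2, 2, false, &dY);  // dY = X^T * dOut : 3x2
  std::printf("dX[0..2] = %g %g %g\n", dX[0], dX[1], dX[2]);  // 1 1 2
  std::printf("dY[0..1] = %g %g\n", dY[0], dY[1]);            // 5 5
  return 0;
}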
...@@ -17,10 +17,12 @@ limitations under the License. */ ...@@ -17,10 +17,12 @@ limitations under the License. */
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plf = paddle::platform; namespace plf = paddle::platform;
REGISTER_OP_CUDA_KERNEL(matmul_v2, REGISTER_OP_CUDA_KERNEL(
ops::MatMulV2Kernel<plf::CUDADeviceContext, float>, matmul_v2, ops::MatMulV2Kernel<plf::CUDADeviceContext, float>,
ops::MatMulV2Kernel<plf::CUDADeviceContext, double>); ops::MatMulV2Kernel<plf::CUDADeviceContext, double>,
ops::MatMulV2Kernel<plf::CUDADeviceContext, plf::float16>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
matmul_v2_grad, ops::MatMulV2GradKernel<plf::CUDADeviceContext, float>, matmul_v2_grad, ops::MatMulV2GradKernel<plf::CUDADeviceContext, float>,
ops::MatMulV2GradKernel<plf::CUDADeviceContext, double>); ops::MatMulV2GradKernel<plf::CUDADeviceContext, double>,
ops::MatMulV2GradKernel<plf::CUDADeviceContext, plf::float16>);
...@@ -163,17 +163,20 @@ void MatMulFunction(const Tensor* X, const Tensor* Y, ...@@ -163,17 +163,20 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
if (trans_y) { if (trans_y) {
const int M = Y->numel() / N; const int M = Y->numel() / N;
VLOG(3) << "MatMul's case 2"; VLOG(3) << "MatMul's case 2";
blas.GEMV(false, M, N, 1., y_data, x_data, 0., Out->data<T>()); blas.GEMV(false, M, N, static_cast<T>(1), y_data, x_data,
static_cast<T>(0), Out->data<T>());
} else { } else {
const int M = y_dims[y_ndim - 1]; const int M = y_dims[y_ndim - 1];
const int batch_size = Y->numel() / (M * N); const int batch_size = Y->numel() / (M * N);
if (batch_size == 1) { if (batch_size == 1) {
VLOG(3) << "MatMul's case 3"; VLOG(3) << "MatMul's case 3";
blas.GEMV(true, N, M, 1., y_data, x_data, 0., Out->data<T>()); blas.GEMV(true, N, M, static_cast<T>(1), y_data, x_data,
static_cast<T>(0), Out->data<T>());
} else { } else {
VLOG(3) << "MatMul's case 4"; VLOG(3) << "MatMul's case 4";
blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, 1.0f, y_data, blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, static_cast<T>(1),
x_data, 0, Out->data<T>(), batch_size, M * N, 0); y_data, x_data, static_cast<T>(0), Out->data<T>(),
batch_size, M * N, 0);
} }
} }
return; return;
...@@ -205,16 +208,19 @@ void MatMulFunction(const Tensor* X, const Tensor* Y, ...@@ -205,16 +208,19 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
const int batch_size = X->numel() / (M * N); const int batch_size = X->numel() / (M * N);
if (batch_size == 1) { if (batch_size == 1) {
VLOG(3) << "MatMul's case 5"; VLOG(3) << "MatMul's case 5";
blas.GEMV(true, N, M, 1.0f, x_data, y_data, 0.0f, Out->data<T>()); blas.GEMV(true, N, M, static_cast<T>(1), x_data, y_data,
static_cast<T>(0), Out->data<T>());
} else { } else {
VLOG(3) << "MatMul's case 6"; VLOG(3) << "MatMul's case 6";
blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, 1.0f, x_data, blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, static_cast<T>(1),
y_data, 0, Out->data<T>(), batch_size, M * N, 0); x_data, y_data, static_cast<T>(0), Out->data<T>(),
batch_size, M * N, 0);
} }
} else { } else {
const int M = X->numel() / N; const int M = X->numel() / N;
VLOG(3) << "MatMul's case 7"; VLOG(3) << "MatMul's case 7";
blas.GEMV(false, M, N, 1.0f, x_data, y_data, 0.0f, Out->data<T>()); blas.GEMV(false, M, N, static_cast<T>(1), x_data, y_data,
static_cast<T>(0), Out->data<T>());
} }
return; return;
} }
...@@ -263,37 +269,38 @@ void MatMulFunction(const Tensor* X, const Tensor* Y, ...@@ -263,37 +269,38 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
if (x_batch_size == 1 && y_batch_size == 1) { if (x_batch_size == 1 && y_batch_size == 1) {
VLOG(3) << "MatMul's case 8"; VLOG(3) << "MatMul's case 8";
blas.GEMM(trans_x ? CblasTrans : CblasNoTrans, blas.GEMM(trans_x ? CblasTrans : CblasNoTrans,
trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, x_data, trans_y ? CblasTrans : CblasNoTrans, M, N, K, static_cast<T>(1),
y_data, 0.0f, Out->data<T>()); x_data, y_data, static_cast<T>(0), Out->data<T>());
} else if (x_batch_size == 1) { } else if (x_batch_size == 1) {
if (M == 1 && trans_y) { if (M == 1 && trans_y) {
VLOG(3) << "MatMul's case 9"; VLOG(3) << "MatMul's case 9";
blas.GEMV(false, y_batch_size * N, K, 1.0f, y_data, x_data, 0.0f, blas.GEMV(false, y_batch_size * N, K, static_cast<T>(1), y_data, x_data,
Out->data<T>()); static_cast<T>(0), Out->data<T>());
} else { } else {
VLOG(3) << "MatMul's case 10"; VLOG(3) << "MatMul's case 10";
blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans,
trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, trans_y ? CblasTrans : CblasNoTrans, M, N, K,
x_data, y_data, 0, Out->data<T>(), out_batch_size, 0, static_cast<T>(1), x_data, y_data, static_cast<T>(0),
K * N); Out->data<T>(), out_batch_size, 0, K * N);
} }
} else if (y_batch_size == 1) { } else if (y_batch_size == 1) {
if (!trans_x) { if (!trans_x) {
VLOG(3) << "MatMul's case 11"; VLOG(3) << "MatMul's case 11";
blas.GEMM(CblasNoTrans, trans_y ? CblasTrans : CblasNoTrans, blas.GEMM(CblasNoTrans, trans_y ? CblasTrans : CblasNoTrans,
x_batch_size * M, N, K, 1.0f, x_data, y_data, 0.0f, x_batch_size * M, N, K, static_cast<T>(1), x_data, y_data,
Out->data<T>()); static_cast<T>(0), Out->data<T>());
} else { } else {
VLOG(3) << "MatMul's case 12"; VLOG(3) << "MatMul's case 12";
blas.BatchedGEMM(CblasTrans, trans_y ? CblasTrans : CblasNoTrans, M, N, K, blas.BatchedGEMM(CblasTrans, trans_y ? CblasTrans : CblasNoTrans, M, N, K,
1.0f, x_data, y_data, 0, Out->data<T>(), out_batch_size, static_cast<T>(1), x_data, y_data, static_cast<T>(0),
M * K, 0); Out->data<T>(), out_batch_size, M * K, 0);
} }
} else if (!is_broadcast_dims) { } else if (!is_broadcast_dims) {
VLOG(3) << "MatMul's case 13"; VLOG(3) << "MatMul's case 13";
blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans,
trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, x_data, trans_y ? CblasTrans : CblasNoTrans, M, N, K,
y_data, 0, Out->data<T>(), out_batch_size, M * K, K * N); static_cast<T>(1), x_data, y_data, static_cast<T>(0),
Out->data<T>(), out_batch_size, M * K, K * N);
} else { } else {
// in the case, can't use stridedgemm // in the case, can't use stridedgemm
std::vector<const T*> x_ptr(out_batch_size); std::vector<const T*> x_ptr(out_batch_size);
...@@ -314,9 +321,9 @@ void MatMulFunction(const Tensor* X, const Tensor* Y, ...@@ -314,9 +321,9 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
} }
VLOG(3) << "MatMul's case 14"; VLOG(3) << "MatMul's case 14";
blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans,
trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, trans_y ? CblasTrans : CblasNoTrans, M, N, K,
x_ptr.data(), y_ptr.data(), 0.0f, out_ptr.data(), static_cast<T>(1), x_ptr.data(), y_ptr.data(),
out_batch_size); static_cast<T>(0), out_ptr.data(), out_batch_size);
} }
} }
......
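The static_cast<T>(1) / static_cast<T>(0) rewrites above are what let the same MatMulFunction instantiate with platform::float16: a raw 1.0f would make template argument deduction see float for the scalar but T for the pointers. A tiny sketch of the failure mode with a hypothetical Half type (not Paddle's float16):

#include <cstdint>
#include <cstdio>

// Hypothetical half-like type: convertible from float only explicitly,
// which is enough to show the deduction problem.
struct Half {
  std::uint16_t bits;
  explicit Half(float) : bits(0) {}  // storage conversion elided
};

template <typename T>
void Axpy(T alpha, const T* x, T* y, int n) {
  (void)alpha; (void)x; (void)y; (void)n;  // body elided; only types matter
}

int main() {
  Half x[2] = {Half(0.f), Half(0.f)};
  Half y[2] = {Half(0.f), Half(0.f)};
  // Axpy(1.0f, x, y, 2);  // ill-formed: T deduced as float from alpha but
  //                       // as Half from x/y, so deduction conflicts.
  Axpy(static_cast<Half>(1.0f), x, y, 2);  // OK: T = Half everywhere
  std::printf("ok\n");
  return 0;
}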
...@@ -55,12 +55,12 @@ inline MKLDNNMemoryFormat GetWeightsFormat(const MKLDNNMemoryFormat format, ...@@ -55,12 +55,12 @@ inline MKLDNNMemoryFormat GetWeightsFormat(const MKLDNNMemoryFormat format,
} }
} }
static mkldnn::memory::data_type GetDstType(bool is_int8, static mkldnn::memory::data_type GetDstType(bool is_int8, bool is_bfloat16,
bool force_fp32_output, bool force_fp32_output,
std::string fuse_activation, std::string fuse_activation,
bool fuse_residual_conn, bool fuse_residual_conn,
const Tensor* residual_param) { const Tensor* residual_param) {
auto dst_dt = mkldnn::memory::data_type::f32; // uint8_t, int8_t, float auto dst_dt = mkldnn::memory::data_type::f32;
if (is_int8) { if (is_int8) {
dst_dt = (fuse_activation == "relu" || fuse_activation == "relu6") dst_dt = (fuse_activation == "relu" || fuse_activation == "relu6")
? mkldnn::memory::data_type::u8 ? mkldnn::memory::data_type::u8
...@@ -72,6 +72,13 @@ static mkldnn::memory::data_type GetDstType(bool is_int8, ...@@ -72,6 +72,13 @@ static mkldnn::memory::data_type GetDstType(bool is_int8,
auto residual_dt = framework::ToMKLDNNDataType(residual_param->type()); auto residual_dt = framework::ToMKLDNNDataType(residual_param->type());
if (dst_dt != residual_dt) dst_dt = residual_dt; if (dst_dt != residual_dt) dst_dt = residual_dt;
} }
} else {
if (!force_fp32_output && is_bfloat16) {
dst_dt = mkldnn::memory::data_type::bf16;
if (fuse_residual_conn && residual_param) {
dst_dt = framework::ToMKLDNNDataType(residual_param->type());
}
}
} }
return dst_dt; return dst_dt;
} }
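With the bfloat16 branch added, GetDstType resolves in a fixed order: int8 inputs choose u8 or s8 from the fused activation and then defer to the residual tensor's type; otherwise bf16 applies only when requested and fp32 output is not forced. A condensed sketch of that decision order (the residual tensor is reduced to a plain enum for brevity, and the elided parts of the int8 branch are not reproduced):

#include <cstdio>
#include <string>

enum class DType { f32, s8, u8, bf16 };

// Condensed mirror of the destination-type choice above.
static DType DstType(bool is_int8, bool is_bf16, bool force_fp32,
                     const std::string& act, bool fuse_residual,
                     DType residual_dt) {
  DType dst = DType::f32;
  if (is_int8) {
    dst = (act == "relu" || act == "relu6") ? DType::u8 : DType::s8;
    if (fuse_residual && residual_dt != dst) dst = residual_dt;
  } else if (is_bf16 && !force_fp32) {
    dst = DType::bf16;
    if (fuse_residual) dst = residual_dt;
  }
  return dst;
}

int main() {
  // bfloat16 requested, no residual fusion: the conv writes bf16 output.
  DType d = DstType(false, true, false, "relu", false, DType::f32);
  std::printf("bf16 chosen: %s\n", d == DType::bf16 ? "yes" : "no");
  return 0;
}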
...@@ -224,12 +231,15 @@ class ConvMKLDNNHandlerT ...@@ -224,12 +231,15 @@ class ConvMKLDNNHandlerT
src_tz.size(), chosen_memory_format); src_tz.size(), chosen_memory_format);
} }
} }
auto data_type = mkldnn::memory::data_type::f32;
const auto src_md = platform::MKLDNNMemDesc( if (ctx.Attr<std::string>("mkldnn_data_type") == "bfloat16" ||
src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format); std::is_same<T_out, platform::bfloat16>::value)
const auto weights_md = data_type = mkldnn::memory::data_type::bf16;
platform::MKLDNNMemDesc(weights_tz, platform::MKLDNNGetDataType<T>(),
MKLDNNMemoryFormat::any); const auto src_md =
platform::MKLDNNMemDesc(src_tz, data_type, chosen_memory_format);
const auto weights_md = platform::MKLDNNMemDesc(weights_tz, data_type,
MKLDNNMemoryFormat::any);
const auto dst_md = platform::MKLDNNMemDesc( const auto dst_md = platform::MKLDNNMemDesc(
dst_tz, platform::MKLDNNGetDataType<T_out>(), chosen_memory_format); dst_tz, platform::MKLDNNGetDataType<T_out>(), chosen_memory_format);
...@@ -241,8 +251,8 @@ class ConvMKLDNNHandlerT ...@@ -241,8 +251,8 @@ class ConvMKLDNNHandlerT
if (bias) { if (bias) {
auto bias_tz = framework::vectorize(bias->dims()); auto bias_tz = framework::vectorize(bias->dims());
auto bias_md = platform::MKLDNNMemDesc( auto bias_md =
bias_tz, platform::MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::x); platform::MKLDNNMemDesc(bias_tz, data_type, MKLDNNMemoryFormat::x);
this->AcquireForwardPrimitiveDescriptor( this->AcquireForwardPrimitiveDescriptor(
conv_attr, fwd_prop_kind, dnnl::algorithm::convolution_direct, conv_attr, fwd_prop_kind, dnnl::algorithm::convolution_direct,
...@@ -384,15 +394,21 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -384,15 +394,21 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
"Operator DNNL Conv must use CPUPlace")); "Operator DNNL Conv must use CPUPlace"));
bool is_INT8 = bool is_INT8 =
std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value; std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value;
bool is_BFLOAT16 = ctx.Attr<std::string>("mkldnn_data_type") == "bfloat16";
auto residual_param = ctx.Input<Tensor>("ResidualData");
bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
std::string fuse_activation = ctx.Attr<std::string>("fuse_activation");
bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
auto dst_dt =
GetDstType(is_INT8, is_BFLOAT16, force_fp32_output, fuse_activation,
fuse_residual_conn, residual_param);
if (!is_INT8) { if (!is_INT8) {
ComputeFP32<float>(ctx); if (dst_dt == mkldnn::memory::data_type::f32) {
ComputeFP32<float>(ctx);
} else if (dst_dt == mkldnn::memory::data_type::bf16) {
ComputeFP32<platform::bfloat16>(ctx);
}
} else { } else {
std::string fuse_activation = ctx.Attr<std::string>("fuse_activation");
bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
auto residual_param = ctx.Input<Tensor>("ResidualData");
auto dst_dt = GetDstType(true, force_fp32_output, fuse_activation,
fuse_residual_conn, residual_param);
if (dst_dt == mkldnn::memory::data_type::f32) { if (dst_dt == mkldnn::memory::data_type::f32) {
ComputeINT8<float>(ctx); ComputeINT8<float>(ctx);
} else if (dst_dt == mkldnn::memory::data_type::u8) { } else if (dst_dt == mkldnn::memory::data_type::u8) {
@@ -1103,6 +1119,10 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN,
                                     ops::kConvMKLDNNFP32,
                                     ops::ConvMKLDNNOpKernel<float, float>);

+REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(
+    conv2d, MKLDNN, ::paddle::platform::CPUPlace, BF16, ops::kConvMKLDNNFP32,
+    ops::ConvMKLDNNOpKernel<paddle::platform::bfloat16, float>);
+
 REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN,
                                     ::paddle::platform::CPUPlace, U8,
                                     ops::kConvMKLDNNINT8,
......
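The conv2d hunks above make the FP32 path reusable for bfloat16 by keying the computation on a destination data type rather than on the kernel's template parameter alone. Below is a minimal standalone sketch of that dispatch idea, assuming a simplified selection helper; the enum, `GetDstTypeSketch`, and the driver are illustrative stand-ins of mine, not Paddle's real `GetDstType`/`ComputeFP32` signatures (the real `GetDstType` also inspects `fuse_residual_connection` and the `ResidualData` tensor).

```cpp
#include <iostream>

// Hypothetical stand-in for mkldnn::memory::data_type.
enum class DataType { f32, bf16, s8, u8 };

// Illustrative analogue of GetDstType: choose the kernel's output type from
// the input type and the requested attributes.
DataType GetDstTypeSketch(bool is_int8, bool is_bfloat16,
                          bool force_fp32_output, bool fuse_relu) {
  if (force_fp32_output) return DataType::f32;  // user forced fp32 output
  if (is_bfloat16) return DataType::bf16;       // bf16 compute path
  if (is_int8) return fuse_relu ? DataType::u8 : DataType::s8;
  return DataType::f32;                         // default fp32 path
}

int main() {
  // mkldnn_data_type == "bfloat16", no INT8 input, no forced fp32 output:
  // the diff dispatches to ComputeFP32<platform::bfloat16> in this case.
  DataType dst = GetDstTypeSketch(false, true, false, false);
  std::cout << (dst == DataType::bf16 ? "bf16 kernel" : "other kernel") << "\n";
  return 0;
}
```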
@@ -110,4 +110,5 @@ class DeQuantOpKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;

 REGISTER_OP_KERNEL(dequantize, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::DeQuantOpKernel<uint8_t>, ops::DeQuantOpKernel<int8_t>);
+                   ops::DeQuantOpKernel<uint8_t>, ops::DeQuantOpKernel<int8_t>,
+                   ops::DeQuantOpKernel<paddle::platform::bfloat16>);
@@ -14,11 +14,11 @@ limitations under the License. */

 #ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/operators/mul_op.h"
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
-#include "paddle/fluid/operators/mul_op.h"

 namespace paddle {
 namespace operators {
......
@@ -40,12 +40,12 @@ using DataLayout = framework::DataLayout;
 // (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW *
 // np.sum(dy,
 // axis=(n,h,w)) * (x - mean) *
-// (np.mean(ddx, axis=(n,h,w)) - ddx) + ddr * (dy * inv_var -
+// (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var -
 // inv_var
 // *
 // np.mean(dy, axis=(n,h,w)) -
 // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
-// axis=(n,h,w))))
+// axis=(n,h,w)))
 template <typename T, int BlockDim, framework::DataLayout layout>
 __global__ void DoubleGradComputeDX(const T *x, const T *mean,
@@ -138,7 +138,7 @@ __global__ void DoubleGradComputeDX(const T *x, const T *mean,
                 ? (j / sample_size * C + i) * sample_size + j % sample_size
                 : j * outer_size + i;
     dx[index] += (dy[index] * var_val - dy_sum_val / inner_size * var_val -
-                  (x[index] - mean_val) * var_val *
+                  (x[index] - mean_val) * var_val * var_val *
                   dy_mul_x_sub_mean_sum_val * var_val / inner_size) *
                  ddscale[i];
   }
@@ -326,19 +326,57 @@ __global__ void DoubleGradComputeDScaleWithGlobal(
 }

 // math: dx = ddscale * dy * inv_var
-// math: ddy = scale * ddx * inv_var
 template <typename T, framework::DataLayout layout>
-__global__ void DoubleGradComputeDataWithGlobal(
-    const T *dy, const T *scale, const T *variance, const double epsilon,
-    const int C, const int sample_size, const int num, T *dx) {
+__global__ void DoubleGradComputeDXWithGlobal(const T *dy, const T *ddscale,
+                                              const T *variance,
+                                              const double epsilon, const int C,
+                                              const int sample_size,
+                                              const int num, T *dx) {
   int gid = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = blockDim.x * gridDim.x;
-  if (scale != nullptr) {
+  if (ddscale != nullptr) {
     for (int i = gid; i < num; i += stride) {
       const int c =
          layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C;
       T inv_var = 1.0 / sqrt(variance[c] + epsilon);
-      dx[i] = dy[i] * scale[c] * inv_var;
+      dx[i] = dy[i] * ddscale[c] * inv_var;
+    }
+  }
+}
+
+// math: ddy = scale * ddx * inv_var + ddbias +
+//       ddscale * (x - mean) * inv_var
+template <typename T, framework::DataLayout layout>
+__global__ void DoubleGradComputeDDYWithGlobal(
+    const T *ddx, const T *scale, const T *mean, const T *variance, const T *x,
+    const T *ddbias, const T *ddscale, const double epsilon, const int C,
+    const int sample_size, const int num, T *ddy) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  if (ddx != nullptr) {
+    for (int i = gid; i < num; i += stride) {
+      const int c =
+          layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C;
+      T inv_var = 1.0 / sqrt(variance[c] + epsilon);
+      ddy[i] += ddx[i] * scale[c] * inv_var;
+    }
+  }
+  __syncthreads();
+  if (ddscale != nullptr) {
+    for (int i = gid; i < num; i += stride) {
+      const int c =
+          layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C;
+      T inv_var = 1.0 / sqrt(variance[c] + epsilon);
+      ddy[i] += (x[i] - mean[c]) * inv_var * ddscale[c];
+    }
+  }
+  __syncthreads();
+  if (ddbias != nullptr) {
+    for (int i = gid; i < num; i += stride) {
+      const int c =
+          layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C;
+      ddy[i] += ddbias[c];
     }
   }
 }
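Written out, the `use_global_stats` formulas that `DoubleGradComputeDXWithGlobal` and `DoubleGradComputeDDYWithGlobal` implement are (with $c$ the channel of element $i$; the $dd\ast$ names follow the kernel arguments):

```latex
\sigma_c^{-1} = \frac{1}{\sqrt{\mathrm{Var}[x]_c + \varepsilon}}, \qquad
dX_i = dY_i \cdot ddScale_c \cdot \sigma_c^{-1}, \qquad
ddY_i = ddX_i \cdot Scale_c \cdot \sigma_c^{-1} + ddBias_c
      + ddScale_c \,(x_i - \mu_c)\,\sigma_c^{-1}.
```

Each of the three $ddY$ terms is accumulated only when the corresponding input pointer is non-null, matching the three guarded loops in the kernel above.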
@@ -383,8 +421,11 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
   const T *mean_data, *variance_data;
   if (use_global_stats) {
+    const auto *running_mean = ctx.Input<Tensor>("Mean");
     const auto *running_var = ctx.Input<Tensor>("Variance");
+    const auto *running_mean_data = running_mean->template data<T>();
     const auto *running_var_data = running_var->template data<T>();
+    mean_data = running_mean_data;
     variance_data = running_var_data;
   } else {
     const T *smean_data = Saved_mean->data<T>();
@@ -398,12 +439,12 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
     set_constant(dev_ctx, dX, static_cast<T>(0));
     if (use_global_stats) {
       if (data_layout == DataLayout::kNHWC) {
-        DoubleGradComputeDataWithGlobal<
+        DoubleGradComputeDXWithGlobal<
             T, DataLayout::kNHWC><<<grid1, block, 0, dev_ctx.stream()>>>(
             dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num,
             dx_data);
       } else {
-        DoubleGradComputeDataWithGlobal<
+        DoubleGradComputeDXWithGlobal<
             T, DataLayout::kNCHW><<<grid1, block, 0, dev_ctx.stream()>>>(
             dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num,
             dx_data);
@@ -456,15 +497,15 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
     set_constant(dev_ctx, ddY, static_cast<T>(0));
     if (use_global_stats) {
       if (data_layout == DataLayout::kNHWC) {
-        DoubleGradComputeDataWithGlobal<
+        DoubleGradComputeDDYWithGlobal<
             T, DataLayout::kNHWC><<<grid1, block, 0, dev_ctx.stream()>>>(
-            ddx_data, scale_data, variance_data, epsilon, C, sample_size, num,
-            ddy_data);
+            ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data,
+            ddscale_data, epsilon, C, sample_size, num, ddy_data);
       } else {
-        DoubleGradComputeDataWithGlobal<
+        DoubleGradComputeDDYWithGlobal<
             T, DataLayout::kNCHW><<<grid1, block, 0, dev_ctx.stream()>>>(
-            ddx_data, scale_data, variance_data, epsilon, C, sample_size, num,
-            ddy_data);
+            ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data,
+            ddscale_data, epsilon, C, sample_size, num, ddy_data);
       }
     } else {
       if (data_layout == DataLayout::kNHWC) {
......
@@ -24,32 +24,45 @@ class DpsgdOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
-                      "Input(Param) of DpsgdOp should not be null.");
+                      platform::errors::NotFound(
+                          "Input(Param) of DpsgdOp should not be null."));
     PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true,
-                      "Input(Grad) of DpsgdOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true,
-                      "Input(LearningRate) of DpsgdOp should not be null.");
+                      platform::errors::NotFound(
+                          "Input(Grad) of DpsgdOp should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("LearningRate"), true,
+        platform::errors::NotFound(
+            "Input(LearningRate) of DpsgdOp should not be null."));
     PADDLE_ENFORCE_EQ(
         ctx->GetInputsVarType("Param").front(),
         framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
+        platform::errors::InvalidArgument(
+            "The input var's type should be LoDTensor, but the received is %s",
+            ctx->GetInputsVarType("Param").front()));
     PADDLE_ENFORCE_EQ(
         ctx->GetInputsVarType("Grad").front(),
         framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
+        platform::errors::InvalidArgument(
+            "The input var's type should be LoDTensor, but the received is %s",
+            ctx->GetInputsVarType("Grad").front()));
     PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
-                      "Output(ParamOut) of DpsgdOp should not be null.");
+                      platform::errors::NotFound(
+                          "Output(ParamOut) of DpsgdOp should not be null."));

     auto lr_dims = ctx->GetInputDim("LearningRate");
     PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
-                      "Learning rate should have 1 dimension");
+                      platform::errors::InvalidArgument(
+                          "Learning rate should have 1 dimension. But received "
+                          "LearningRate's dims [%s].",
+                          framework::product(lr_dims)));
     auto param_dims = ctx->GetInputDim("Param");
     PADDLE_ENFORCE_EQ(
         param_dims, ctx->GetInputDim("Grad"),
-        "Param and Grad input of DpsgdOp should have same dimension");
+        platform::errors::InvalidArgument(
+            "Param and Grad input of DpsgdOp should have same dimension. But "
+            "received Param's dim [%s] and Grad's dim [%s].",
+            param_dims, ctx->GetInputDim("Grad")));

     ctx->SetOutputDim("ParamOut", param_dims);
   }
......
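The optimizer-op hunks that follow all apply the same mechanical rewrite: the bare message string passed to `PADDLE_ENFORCE` / `PADDLE_ENFORCE_EQ` is wrapped in a typed `platform::errors::*` object. A standalone sketch of the pattern under simplified assumptions follows; `EnforceEq` and `NotFoundError` are stand-ins of mine, and Paddle's actual macros do considerably more (source location, formatting, error registry).

```cpp
#include <iostream>
#include <stdexcept>
#include <string>
#include <utility>

// Hypothetical stand-in for platform::errors::NotFound.
struct NotFoundError : std::runtime_error {
  explicit NotFoundError(const std::string& msg) : std::runtime_error(msg) {}
};

// Illustrative analogue of PADDLE_ENFORCE_EQ(a, b, error): compare the two
// values and, on failure, throw the structured error built at the call site.
template <typename A, typename B, typename E>
void EnforceEq(const A& a, const B& b, E&& err) {
  if (!(a == b)) throw std::forward<E>(err);
}

int main() {
  bool has_param = false;  // pretend ctx->HasInput("Param") returned false
  try {
    EnforceEq(has_param, true,
              NotFoundError("Input(Param) of DpsgdOp should not be null."));
  } catch (const NotFoundError& e) {
    // The failure now carries an error class (NotFound) plus the message,
    // which is what the platform::errors migration in these hunks buys.
    std::cout << "NotFoundError: " << e.what() << "\n";
  }
  return 0;
}
```

Encoding the error class at the call site lets the framework report NotFound vs. InvalidArgument uniformly instead of parsing message text.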
@@ -28,17 +28,19 @@ class DpsgdOpKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext &ctx) const override {
     const auto *param_var = ctx.InputVar("Param");
     PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
-                      "The Var(%s)'s type should be LoDTensor, "
-                      "but the received is %s",
-                      ctx.InputNames("Param").front(),
-                      framework::ToTypeName(param_var->Type()));
+                      platform::errors::InvalidArgument(
+                          "The Var(%s)'s type should be LoDTensor, "
+                          "but the received is %s",
+                          ctx.InputNames("Param").front(),
+                          framework::ToTypeName(param_var->Type())));

     const auto *grad_var = ctx.InputVar("Grad");
     PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
-                      "The Var(%s)'s type should be LoDTensor, "
-                      "but the received is %s",
-                      ctx.InputNames("Grad").front(),
-                      framework::ToTypeName(grad_var->Type()));
+                      platform::errors::InvalidArgument(
+                          "The Var(%s)'s type should be LoDTensor, "
+                          "but the received is %s",
+                          ctx.InputNames("Grad").front(),
+                          framework::ToTypeName(grad_var->Type())));

     const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
......
@@ -40,43 +40,62 @@ class MomentumOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Param"),
-                   "Input(param) of Momentum should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(grad) of Momentum should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Velocity"),
-                   "Input(velocity) of Momentum should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
-                   "Input(LearningRate) of Momentum should not be null.");
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("Param").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
-
-    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                   "Output(ParamOut) of Momentum should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("VelocityOut"),
-                   "Output(VelocityOut) of Momentum should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
+                      platform::errors::NotFound(
+                          "Input(param) of Momentum should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true,
+                      platform::errors::NotFound(
+                          "Input(grad) of Momentum should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Velocity"), true,
+                      platform::errors::NotFound(
+                          "Input(velocity) of Momentum should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("LearningRate"), true,
+        platform::errors::NotFound(
+            "Input(LearningRate) of Momentum should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->GetInputsVarType("Param").front(),
+        framework::proto::VarType::LOD_TENSOR,
+        platform::errors::InvalidArgument(
+            "The input var's type should be LoDTensor, but the received is %s",
+            ctx->GetInputsVarType("Param").front()));
+
+    PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
+                      platform::errors::NotFound(
+                          "Output(ParamOut) of Momentum should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasOutput("VelocityOut"), true,
+        platform::errors::NotFound(
+            "Output(VelocityOut) of Momentum should not be null."));

     auto lr_dims = ctx->GetInputDim("LearningRate");
     PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
-                      "Maybe the Input variable LearningRate has not "
-                      "been initialized. You may need to confirm "
-                      "if you put exe.run(startup_program) "
-                      "after optimizer.minimize function.");
+                      platform::errors::InvalidArgument(
+                          "Maybe the Input variable LearningRate has not "
+                          "been initialized. You may need to confirm "
+                          "if you put exe.run(startup_program) "
+                          "after optimizer.minimize function."));
     PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
-                      "Learning_rate should be a scalar");
+                      platform::errors::InvalidArgument(
+                          "Learning_rate should be a scalar. But received "
+                          "LearningRate's dim [%s]",
+                          framework::product(lr_dims)));

     auto param_dim = ctx->GetInputDim("Param");
     if (ctx->GetInputsVarType("Grad")[0] ==
         framework::proto::VarType::LOD_TENSOR) {
       PADDLE_ENFORCE_EQ(
           param_dim, ctx->GetInputDim("Grad"),
-          "Param and Grad input of MomentumOp should have the same dimension.");
+          platform::errors::InvalidArgument(
+              "Param and Grad input of MomentumOp should have the same "
+              "dimension. But received Param's dim [%s] and Grad's dim [%s].",
+              param_dim, ctx->GetInputDim("Grad")));
       PADDLE_ENFORCE_EQ(
           param_dim, ctx->GetInputDim("Velocity"),
-          "Param and Velocity of MomentumOp should have the same dimension.");
+          platform::errors::InvalidArgument(
+              "Param and Velocity of MomentumOp should have the same "
+              "dimension. But received Param's dim [%s] and Velocity's dim "
+              "[%s].",
+              param_dim, ctx->GetInputDim("Velocity")));
     }
     ctx->SetOutputDim("ParamOut", param_dim);
@@ -398,10 +417,12 @@ class MomentumOpKernel : public framework::OpKernel<T> {
         for_range(functor);
       }
     } else {
-      PADDLE_THROW(
-          string::Sprintf("MomentumOp only supports LoDTensor or SelectedRows "
-                          "gradient, but the received Variable Type is %s",
-                          framework::ToTypeName(grad_var->Type())));
+      PADDLE_ENFORCE_EQ(false, true,
+                        platform::errors::PermissionDenied(
+                            "Unsupported Variable Type of Grad "
+                            "in MomentumOp. Expected LoDTensor "
+                            "or SelectedRows, but received [%s]",
+                            paddle::framework::ToTypeName(grad_var->Type())));
     }
   }
 };
......
@@ -22,47 +22,75 @@ class RmspropOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;

   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Param"),
-                   "Input(Param) of RmspropOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("MeanSquare"),
-                   "Input(MeanSquare) of RmspropOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
-                   "Input(LearningRate) of RmspropOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(Grad) of RmspropOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Moment"),
-                   "Input(Moment) of RmspropOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("Param").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
-
-    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                   "Output(param_out) of RmspropOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
-                   "Output(MomentOut) of RmspropOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("MeanSquareOut"),
-                   "Output(MeanSquareOut) of RmspropOp should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
+                      platform::errors::NotFound(
+                          "Input(Param) of RmspropOp should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("MeanSquare"), true,
+        platform::errors::NotFound(
+            "Input(MeanSquare) of RmspropOp should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("LearningRate"), true,
+        platform::errors::NotFound(
+            "Input(LearningRate) of RmspropOp should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true,
+                      platform::errors::NotFound(
+                          "Input(Grad) of RmspropOp should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Moment"), true,
+                      platform::errors::NotFound(
+                          "Input(Moment) of RmspropOp should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("Param").front(),
+                      framework::proto::VarType::LOD_TENSOR,
+                      platform::errors::InvalidArgument(
+                          "The input var's type in RmspropOp should be "
+                          "LoDTensor, but the received is %s",
+                          ctx->GetInputsVarType("Param").front()));
+
+    PADDLE_ENFORCE_EQ(
+        ctx->HasOutput("ParamOut"), true,
+        platform::errors::NotFound(
+            "Output(param_out) of RmspropOp should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasOutput("MomentOut"), true,
+        platform::errors::NotFound(
+            "Output(MomentOut) of RmspropOp should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasOutput("MeanSquareOut"), true,
+        platform::errors::NotFound(
+            "Output(MeanSquareOut) of RmspropOp should not be null."));

     if (ctx->Attrs().Get<bool>("centered")) {
-      PADDLE_ENFORCE(ctx->HasOutput("MeanGradOut"),
-                     "Output(MeanGradOut) of RmspropOp should not be null.");
+      PADDLE_ENFORCE_EQ(
+          ctx->HasOutput("MeanGradOut"), true,
+          platform::errors::NotFound(
+              "Output(MeanGradOut) of RmspropOp should not be null."));
     }

     auto param_dim = ctx->GetInputDim("Param");
     PADDLE_ENFORCE_EQ(
         param_dim, ctx->GetInputDim("Grad"),
-        "Param and grad input of RmspropOp should have the same dimension.");
+        platform::errors::InvalidArgument(
+            "Param and grad input of RmspropOp should have the same "
+            "dimension. But received Param's dim [%s] and Grad's dim [%s].",
+            param_dim, ctx->GetInputDim("Grad")));
     PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Moment"),
-                      "Param and Momentum input of RmspropOp "
-                      "should have the same dimension.");
+                      platform::errors::InvalidArgument(
+                          "Param and Moment input of RmspropOp "
+                          "should have the same dimension. But received "
+                          "Param's dim [%s] and Moment's dim [%s].",
+                          param_dim, ctx->GetInputDim("Moment")));
     PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("MeanSquare"),
-                      "Param and Momentum input of RmspropOp "
-                      "should have the same dimension.");
+                      platform::errors::InvalidArgument(
+                          "Param and MeanSquare input of RmspropOp "
+                          "should have the same dimension. But received "
+                          "Param's dim [%s] and MeanSquare's dim [%s].",
+                          param_dim, ctx->GetInputDim("MeanSquare")));

     auto lr_dim = ctx->GetInputDim("LearningRate");
     PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1,
-                      "Learning Rate should be a scalar.");
+                      platform::errors::InvalidArgument(
+                          "Learning Rate of RmspropOp should be a scalar. But "
+                          "received LearningRate's dim [%s]",
+                          framework::product(lr_dim)));

     ctx->SetOutputDim("ParamOut", param_dim);
     ctx->SetOutputDim("MomentOut", param_dim);
......
@@ -148,11 +148,15 @@ class RmspropOpKernel : public framework::OpKernel<T> {
     auto &mom_tensor = *ctx.Input<LoDTensor>("Moment");

     PADDLE_ENFORCE_EQ(&p_tensor, param_out,
-                      "Param and ParamOut must be the same Tensor");
+                      platform::errors::InvalidArgument(
+                          "Param and ParamOut must be the same Tensor"));
     PADDLE_ENFORCE_EQ(&mom_tensor, moment_out,
-                      "Moment and MomentOut must be the same Tensor");
-    PADDLE_ENFORCE_EQ(&ms_tensor, mean_square_out,
-                      "MeanSquare and MeanSquareOut must be the same Tensor");
+                      platform::errors::InvalidArgument(
+                          "Moment and MomentOut must be the same Tensor"));
+    PADDLE_ENFORCE_EQ(
+        &ms_tensor, mean_square_out,
+        platform::errors::InvalidArgument(
+            "MeanSquare and MeanSquareOut must be the same Tensor"));

     auto &dev_ctx = ctx.template device_context<DeviceContext>();
     size_t limit = static_cast<size_t>(ms_tensor.numel());
@@ -179,8 +183,10 @@ class RmspropOpKernel : public framework::OpKernel<T> {
       auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
       auto mg = EigenVector<T>::Flatten(mg_tensor);
       auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
-      PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out,
-                        "MeanGrad and MeanGradOut must be the same Tensor");
+      PADDLE_ENFORCE_EQ(
+          &mg_tensor, mean_grad_out,
+          platform::errors::InvalidArgument(
+              "MeanGrad and MeanGradOut must be the same Tensor"));
       auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);

       mg_out.device(place) = rho * mg + (1 - rho) * g;
@@ -198,8 +204,10 @@ class RmspropOpKernel : public framework::OpKernel<T> {
     if (centered) {
       auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
       auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
-      PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out,
-                        "MeanGrad and MeanGradOut must be the same Tensor");
+      PADDLE_ENFORCE_EQ(
+          &mg_tensor, mean_grad_out,
+          platform::errors::InvalidArgument(
+              "MeanGrad and MeanGradOut must be the same Tensor"));
       for_range(CenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>(
           param_out->mutable_data<T>(ctx.GetPlace()),
           mean_square_out->mutable_data<T>(ctx.GetPlace()),
@@ -233,8 +241,10 @@ class RmspropOpKernel : public framework::OpKernel<T> {
       if (centered) {
         auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
         auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
-        PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out,
-                          "MeanGrad and MeanGradOut must be the same Tensor");
+        PADDLE_ENFORCE_EQ(
+            &mg_tensor, mean_grad_out,
+            platform::errors::InvalidArgument(
+                "MeanGrad and MeanGradOut must be the same Tensor"));
         for_range(CenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>(
             param_out->mutable_data<T>(ctx.GetPlace()),
             mean_square_out->mutable_data<T>(ctx.GetPlace()),
@@ -249,7 +259,12 @@ class RmspropOpKernel : public framework::OpKernel<T> {
             rho, epsilon, momentum, grad_func));
       }
     } else {
-      PADDLE_THROW("RMSProp only supports LoDTensor or SelectedRows gradient");
+      PADDLE_ENFORCE_EQ(false, true,
+                        platform::errors::PermissionDenied(
+                            "Unsupported Variable Type of Grad "
+                            "in RmspropOp. Expected LoDTensor "
+                            "or SelectedRows, but received [%s]",
+                            paddle::framework::ToTypeName(grad_var->Type())));
     }
   }
 };
......
@@ -22,23 +22,31 @@ class SGDOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;

   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Param"),
-                   "Input(Param) of SGDOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(Grad) of SGDOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
-                   "Input(LearningRate) of SGDOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                   "Output(ParamOut) of SGDOp should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
+                      platform::errors::NotFound(
+                          "Input(Param) of SGDOp should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("Grad"), true,
+        platform::errors::NotFound("Input(Grad) of SGDOp should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true,
+                      platform::errors::NotFound(
+                          "Input(LearningRate) of SGDOp should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
+                      platform::errors::NotFound(
+                          "Output(ParamOut) of SGDOp should not be null."));

     auto lr_dims = ctx->GetInputDim("LearningRate");
     PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
-                      "Maybe the Input variable LearningRate has not "
-                      "been initialized. You may need to confirm "
-                      "if you put exe.run(startup_program) "
-                      "after optimizer.minimize function.");
+                      platform::errors::NotFound(
+                          "Maybe the Input variable LearningRate has not "
+                          "been initialized. You may need to confirm "
+                          "if you put exe.run(startup_program) "
+                          "after optimizer.minimize function."));
     PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
-                      "Learning rate should have 1 element");
+                      platform::errors::InvalidArgument(
+                          "Learning rate should have 1 element. But received "
+                          "LearningRate dims [%s]",
+                          framework::product(lr_dims)));
     auto param_dim = ctx->GetInputDim("Param");
     if (ctx->GetInputsVarType("Grad")[0] ==
         framework::proto::VarType::LOD_TENSOR) {
......
@@ -57,11 +57,12 @@ class SGDOpKernel<platform::CUDADeviceContext, T>
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     const auto* param_var = ctx.InputVar("Param");
-    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
-                   "The Var(%s)'s type should be LoDTensor, "
-                   "but the received is %s",
-                   ctx.InputNames("Param").front(),
-                   framework::ToTypeName(param_var->Type()));
+    PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
+                      platform::errors::InvalidArgument(
+                          "The Var(%s)'s type should be LoDTensor, "
+                          "but the received is %s",
+                          ctx.InputNames("Param").front(),
+                          paddle::framework::ToTypeName(param_var->Type())));

     auto* param = ctx.Input<framework::Tensor>("Param");
     auto* param_out = ctx.Output<framework::Tensor>("ParamOut");
@@ -91,18 +92,30 @@ class SGDOpKernel<platform::CUDADeviceContext, T>
       // TODO(qijun): In Sparse SGD operator, in-place update is enforced.
       // This manual optimization brings difficulty to track data dependency.
      // It's better to find a more elegant solution.
-      PADDLE_ENFORCE_EQ(param, param_out);
+      PADDLE_ENFORCE_EQ(
+          param, param_out,
+          platform::errors::InvalidArgument(
+              "The input tensor Param of SgdOp should be equal to ParamOut "
+              "if variable's type is SelectedRows."));
       auto* grad = ctx.Input<framework::SelectedRows>("Grad");

       auto in_height = grad->height();
       auto out_dims = param_out->dims();
-      PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
+      PADDLE_ENFORCE_EQ(in_height, out_dims[0],
+                        platform::errors::InvalidArgument(
+                            "The input tensor Grad's height of SgdOp should be "
+                            "equal to ParamOut's dims. But received Grad's "
+                            "height [%s] and ParamOut's dims [%s]",
+                            in_height, out_dims[0]));

       auto& in_value = grad->value();
       auto& in_rows = grad->rows();

       int64_t in_row_numel = in_value.numel() / in_rows.size();
-      PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);
+      PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height,
+                        platform::errors::InvalidArgument(
+                            "The in_row_numel of SgdOp should be equal to "
+                            "param_out's numel / in_height."));

       auto* in_data = in_value.data<T>();
       auto* out_data = param_out->data<T>();
@@ -118,7 +131,12 @@ class SGDOpKernel<platform::CUDADeviceContext, T>
           out_data, in_row_numel, in_rows.size());
     } else {
-      PADDLE_THROW("Unsupported Variable Type of Grad");
+      PADDLE_ENFORCE_EQ(false, true,
+                        platform::errors::PermissionDenied(
+                            "Unsupported Variable Type of Grad "
+                            "in SgdOp. Expected LoDTensor or "
+                            "SelectedRows, but received [%s]",
+                            paddle::framework::ToTypeName(grad_var->Type())));
     }
   }
 };
......
@@ -41,7 +41,9 @@ inline std::vector<T> GetDataFromTensor(const framework::Tensor* x) {
     // NOTE: Converting int64 to int32 may cause data overflow.
     vec_new_data = std::vector<T>(data, data + x->numel());
   } else {
-    PADDLE_THROW("The dtype of Tensor must be int32 or int64.");
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "The dtype of Tensor must be int32 or int64, but received: %s",
+        x->type()));
   }
   return vec_new_data;
 }
@@ -53,10 +55,11 @@ inline std::vector<T> GetDataFromTensorList(
   for (size_t i = 0; i < list_tensor.size(); ++i) {
     auto tensor = list_tensor[i];
     PADDLE_ENFORCE_EQ(tensor->dims(), framework::make_ddim({1}),
-                      "ShapeError: The shape of Tensor in list must be [1]. "
-                      "But received the shape "
-                      "is [%s]",
-                      tensor->dims());
+                      platform::errors::InvalidArgument(
+                          "The shape of Tensor in list must be [1]. "
+                          "But received its shape "
+                          "is [%s]",
+                          tensor->dims()));

     if (tensor->type() == framework::proto::VarType::INT32) {
       if (platform::is_gpu_place(tensor->place())) {
@@ -76,7 +79,10 @@ inline std::vector<T> GetDataFromTensorList(
         vec_new_data.push_back(static_cast<T>(*tensor->data<int64_t>()));
       }
     } else {
-      PADDLE_THROW("The dtype of Tensor in list must be int32 or int64.");
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "The dtype of Tensor in list must be int32 or int64, but received: "
+          "%s",
+          tensor->type()));
     }
   }
   return vec_new_data;
......
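`GetDataFromTensor` above dispatches on the tensor's runtime dtype before copying its buffer into a `std::vector<T>`. A self-contained sketch of that shape follows; `MiniTensor` and the `DType` enum are hypothetical stand-ins for `framework::Tensor` and its dtype, not Paddle's real types.

```cpp
#include <cstdint>
#include <stdexcept>
#include <vector>

// Hypothetical miniature of framework::Tensor, just enough for the sketch.
enum class DType { INT32, INT64, FP32 };
struct MiniTensor {
  DType dtype;
  const void* data;
  int64_t numel;
};

// Illustrative analogue of GetDataFromTensor: copy the buffer into a
// std::vector<T>, rejecting unsupported dtypes with a descriptive error.
template <typename T>
std::vector<T> GetDataFromTensorSketch(const MiniTensor& x) {
  if (x.dtype == DType::INT32) {
    const int32_t* p = static_cast<const int32_t*>(x.data);
    return std::vector<T>(p, p + x.numel);
  }
  if (x.dtype == DType::INT64) {
    // NOTE: narrowing int64 -> int32 may overflow, as the diff's comment warns.
    const int64_t* p = static_cast<const int64_t*>(x.data);
    return std::vector<T>(p, p + x.numel);
  }
  // Mirrors the enriched message: report what is legal at the failure site.
  throw std::invalid_argument("The dtype of Tensor must be int32 or int64.");
}

int main() {
  int64_t buf[3] = {1, 2, 3};
  MiniTensor t{DType::INT64, buf, 3};
  return GetDataFromTensorSketch<int>(t).size() == 3 ? 0 : 1;
}
```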
@@ -15,6 +15,7 @@ limitations under the License. */

 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/init.h"
+#include "paddle/fluid/platform/xpu_info.h"

 TEST(InitDevices, CPU) {
   using paddle::framework::InitDevices;
......
(2 collapsed file diffs omitted.)
@@ -988,11 +988,6 @@ set +x
                 fi
                 read testcase <<< $(echo "$line"|grep -oEi "\w+$")

-                if python $PADDLE_ROOT/tools/is_ut_disabled.py $testcase; then
-                    echo $testcase" is disabled."
-                    continue
-                fi
-
                 if [[ "$is_nightly" != "" ]] && [ ${NIGHTLY_MODE:-OFF} == "OFF" ]; then
                     echo $testcase" will only run at night."
                     continue
......
(2 collapsed file diffs omitted.)
@@ -19,10 +19,12 @@ from paddle.fluid import core
 from paddle.fluid.layer_helper import LayerHelper
 from paddle.fluid.data_feeder import check_dtype, check_type
 from ..utils import deprecated
+from paddle.fluid.framework import static_only

 __all__ = ['data']


+@static_only
 @deprecated(since="2.0.0", update_to="paddle.static.data")
 def data(name, shape, dtype='float32', lod_level=0):
     """
......
(17 collapsed file diffs omitted.)