Commit 36cecf61 authored by Shang Zhizhou

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_test_activation_op_random_bug
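
Most hunks below apply the same cleanup: untyped PADDLE_ENFORCE / PADDLE_THROW calls are replaced with the typed enforcement macros (PADDLE_ENFORCE_EQ, _NE, _GE, _GT, _NOT_NULL, _CUDA_SUCCESS) carrying a platform::errors::* payload. A minimal sketch of the pattern, assuming Paddle's enforce header is available and using hypothetical variable names:

    // Before: condition-style check with a bare message string.
    PADDLE_ENFORCE(in_vars.size() == places.size(),
                   "The number of inputs should equal the number of places.");

    // After: comparison macro plus a typed, printf-style error payload.
    PADDLE_ENFORCE_EQ(in_vars.size(), places.size(),
                      platform::errors::InvalidArgument(
                          "The number of inputs should be equal to the number "
                          "of places, but got %d inputs and %d places.",
                          in_vars.size(), places.size()));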
@@ -18,7 +18,7 @@ SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc)
 SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
 SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
 set(WARPCTC_REPOSITORY https://github.com/baidu-research/warp-ctc.git)
-set(WARPCTC_TAG fc7f226b93758216a03b1be9d24593a12819b984)
+set(WARPCTC_TAG 95a461eddeabd51099ef059dcfada1117eb1bfb8)
 SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
     CACHE PATH "Warp-ctc Directory" FORCE)
@@ -44,8 +44,9 @@ ExternalProject_Add(
     "${WARPCTC_DOWNLOAD_CMD}"
     PREFIX ${WARPCTC_PREFIX_DIR}
     SOURCE_DIR ${WARPCTC_SOURCE_DIR}
-    UPDATE_COMMAND ""
+    #UPDATE_COMMAND ""
     PATCH_COMMAND ""
+    BUILD_ALWAYS 1
     CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
......
@@ -4,7 +4,7 @@ endif()
 INCLUDE(ExternalProject)
 SET(XPU_PROJECT "extern_xpu")
-SET(XPU_URL "https://kunlun1.su.bcebos.com/xpu.tar.gz" CACHE STRING "" FORCE)
+SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
 SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
 SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")
......
@@ -62,9 +62,9 @@ function(op_library TARGET)
       endif()
     endif()
     if(WITH_XPU)
-      string(REPLACE "_op" "_xpu_op" XPU_FILE "${TARGET}")
-      if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${XPU_FILE}.cc)
-        list(APPEND xpu_cc_srcs xpu/${XPU_FILE}.cc)
+      string(REPLACE "_op" "_op_xpu" XPU_FILE "${TARGET}")
+      if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${XPU_FILE}.cc)
+        list(APPEND xpu_cc_srcs ${XPU_FILE}.cc)
       endif()
     endif()
   else()
@@ -83,7 +83,7 @@ function(op_library TARGET)
       list(APPEND mkldnn_cc_srcs ${src})
     elseif(${src} MATCHES ".*\\.cu.cc$")
       list(APPEND cu_cc_srcs ${src})
-    elseif(WITH_XPU AND ${src} MATCHES ".*_xpu_op.cc$")
+    elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$")
       list(APPEND xpu_cc_srcs ${src})
     elseif(${src} MATCHES ".*\\.cc$")
       list(APPEND cc_srcs ${src})
......
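The op_library change above also moves XPU kernel sources out of the xpu/ subdirectory and renames them: for a hypothetical target mul_op, the rule previously looked for xpu/mul_xpu_op.cc and now looks for mul_op_xpu.cc next to the other sources, matching the new ".*_op_xpu.cc" pattern used when sources are listed explicitly.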
@@ -76,7 +76,7 @@ void AllReduceOpHandle::AllReduceImpl(
       platform::errors::InvalidArgument(
           "The NoDummyInputSize should be equal "
           "to the number of places, but got NoDummyInputSize is "
-          "%d and the number of place is %d.",
+          "%d and the number of places is %d.",
          in_var_handles.size(), num_places));
   PADDLE_ENFORCE_EQ(
       in_var_handles.size(), out_var_handles.size(),
@@ -89,7 +89,7 @@ void AllReduceOpHandle::AllReduceImpl(
       platform::errors::InvalidArgument(
           "The number of local scopes should be equal "
           "to the number of places, but got the number of local scopes is "
-          "%d and the number of place is %d.",
+          "%d and the number of places is %d.",
          in_var_handles.size(), num_places));
   std::vector<const void *> lod_tensor_data;
......
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/broadcast_op_handle.h"
+
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -31,10 +32,15 @@ void BroadcastOpHandle::RunImpl() {
   auto out_var_handles = DynamicCast<VarHandle>(outputs_);
   PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL,
-                    "The number of input should be one.");
-  PADDLE_ENFORCE_EQ(
-      out_var_handles.size(), places_.size(),
-      "The number of output should equal to the number of places.");
+                    platform::errors::PreconditionNotMet(
+                        "The number of inputs should be 1, but got %d.",
+                        in_var_handles.size()));
+  PADDLE_ENFORCE_EQ(out_var_handles.size(), places_.size(),
+                    platform::errors::PreconditionNotMet(
+                        "The number of outputs and the number of places should "
+                        "be equal, but got the number of outputs is %d and the "
+                        "number of places is %d.",
+                        out_var_handles.size(), places_.size()));
   VarHandle *in_var_handle = in_var_handles[0];
@@ -47,7 +53,9 @@ void BroadcastOpHandle::BroadcastOneVar(
     const std::vector<Scope *> &var_scopes) {
   auto *in_var =
       var_scopes.at(in_var_handle.scope_idx())->FindVar(in_var_handle.name());
-  PADDLE_ENFORCE_NOT_NULL(in_var);
+  PADDLE_ENFORCE_NOT_NULL(
+      in_var, platform::errors::NotFound("Variable %s is not found in scopes.",
+                                         in_var_handle.name()));
   Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
   if (UNLIKELY(!in_tensor.IsInitialized())) {
     VLOG(3) << "in var " << in_var_handle.name() << "not inited, return!";
@@ -103,7 +111,7 @@ void BroadcastOpHandle::BroadcastOneVar(
       broadcast_calls.emplace_back(
           [send_recv_buffer, numel, type, root_id, &nccl_ctx] {
-            PADDLE_ENFORCE(platform::dynload::ncclBcast(
+            PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
                 send_recv_buffer, numel, static_cast<ncclDataType_t>(type),
                 root_id, nccl_ctx.comm_, nccl_ctx.stream()));
           });
@@ -131,7 +139,8 @@ void BroadcastOpHandle::BroadcastOneVar(
       nccl_ctxs_->DevCtx(p)->Wait();
     }
 #else
-    PADDLE_THROW("CUDA is not enabled.");
+    PADDLE_THROW(
+        platform::errors::PreconditionNotMet("Not compiled with NCLL."));
 #endif
   }
 }
@@ -154,10 +163,13 @@ void BroadcastOpHandle::InitOutputValue(
   auto t_out_p = out_var_handle->place();
   auto *out_var = var_scopes.at(out_var_handle->scope_idx())
                       ->FindVar(out_var_handle->name());
-  PADDLE_ENFORCE_NOT_NULL(out_var);
+  PADDLE_ENFORCE_NOT_NULL(out_var, platform::errors::NotFound(
+                                       "Variable %s is not found in scopes.",
+                                       out_var_handle->name()));
   if (is_gpu_place(in_tensor.place())) {
-    PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
-                   "Places of input and output must be all on GPU.");
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(t_out_p), true,
+                      platform::errors::PreconditionNotMet(
+                          "Places of input and output must be all on GPU."));
   } else {
     t_out_p = platform::CPUPlace();
   }
......
@@ -79,7 +79,8 @@ struct TestBroadcastOpHandle {
       }
       nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_));
 #else
-      PADDLE_THROW("CUDA is not support.");
+      PADDLE_THROW(
+          platform::errors::PreconditionNotMet("Not compiled with NCLL."));
 #endif
     } else {
       int count = 8;
@@ -113,7 +114,8 @@ struct TestBroadcastOpHandle {
       op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
                                          place_list_, nccl_ctxs_.get());
 #else
-      PADDLE_THROW("CUDA is not support.");
+      PADDLE_THROW(
+          platform::errors::PreconditionNotMet("Not compiled with NCLL."));
 #endif
     } else {
 #if defined(PADDLE_WITH_NCCL)
@@ -171,7 +173,9 @@ struct TestBroadcastOpHandle {
                      float val_scalar = 0.0) {
     auto var = param_scopes_[input_scope_idx]->FindVar(varname);
-    PADDLE_ENFORCE_NOT_NULL(var);
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::NotFound("Variable %s is not found in scope.",
+                                        varname));
     auto lod_tensor = var->GetMutable<f::LoDTensor>();
     std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
     for (size_t k = 0; k < send_vector.size(); ++k) {
@@ -194,7 +198,9 @@ struct TestBroadcastOpHandle {
     }
     auto var = param_scopes_[input_scope_idx]->FindVar(varname);
-    PADDLE_ENFORCE_NOT_NULL(var);
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::NotFound("Variable %s is not found in scope.",
+                                        varname));
     auto selected_rows = var->GetMutable<f::SelectedRows>();
     auto value = selected_rows->mutable_value();
     value->mutable_data<float>(kDims, place_list_[input_scope_idx]);
@@ -211,13 +217,24 @@ struct TestBroadcastOpHandle {
                          const std::vector<float>& send_vector,
                          const std::vector<int64_t>& rows, int height) {
     auto var = param_scopes_[input_scope_idx]->FindVar(varname);
-    PADDLE_ENFORCE_NOT_NULL(var);
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::NotFound("Variable %s is not found in scope.",
+                                        varname));
     auto& selected_rows = var->Get<f::SelectedRows>();
     auto rt = selected_rows.value();
-    PADDLE_ENFORCE_EQ(selected_rows.height(), height, "height is not equal.");
+    PADDLE_ENFORCE_EQ(selected_rows.height(), height,
+                      platform::errors::InvalidArgument(
+                          "The height of SelectedRows is not equal to "
+                          "the expected, expect %d, but got %ld.",
+                          height, selected_rows.height()));
     for (size_t k = 0; k < selected_rows.rows().size(); ++k) {
-      PADDLE_ENFORCE_EQ(selected_rows.rows()[k], rows[k]);
+      PADDLE_ENFORCE_EQ(
+          selected_rows.rows()[k], rows[k],
+          platform::errors::InvalidArgument(
+              "The item at position %zu of rows of SelectedRows "
+              "is not equal to the expected, expect %ld, but got %ld.",
+              k, rows[k], selected_rows.rows()[k]));
     }
     p::CPUPlace cpu_place;
@@ -235,9 +252,15 @@ struct TestBroadcastOpHandle {
                    framework::Scope* scope) {
     p::CPUPlace cpu_place;
     auto var = scope->FindVar(varname);
-    PADDLE_ENFORCE_NOT_NULL(var);
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::NotFound("Variable %s is not found in scope.",
+                                        varname));
     auto tensor = var->Get<f::LoDTensor>();
-    PADDLE_ENFORCE_EQ(tensor.lod(), lod, "lod is not equal.");
+    PADDLE_ENFORCE_EQ(tensor.lod(), lod,
+                      platform::errors::InvalidArgument(
+                          "The LoD of tensor is not equal to "
+                          "the expected, expect %s, but got %s.",
+                          lod, tensor.lod()));
     f::Tensor result_tensor;
     f::TensorCopySync(tensor, cpu_place, &result_tensor);
     float* ct = result_tensor.mutable_data<float>(cpu_place);
......
@@ -235,7 +235,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
             AppendPass("reduce_mode_multi_devices_pass").get();
         break;
       default:
-        PADDLE_THROW("Unknown reduce strategy.");
+        PADDLE_THROW(
+            platform::errors::Unimplemented("Unknown reduce strategy."));
     }
   }
   multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
......
@@ -12,11 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
+
 #include <memory>
 #include <unordered_set>
 #include <utility>
-#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
 #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/scope.h"
@@ -47,15 +48,19 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
     if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
       platform::CUDADeviceGuard guard(
           BOOST_GET_CONST(platform::CUDAPlace, place).device);
-      PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
-      PADDLE_ENFORCE_NOT_NULL(event_);
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
+      PADDLE_ENFORCE_NOT_NULL(event_, platform::errors::InvalidArgument(
+                                          "The cuda envet created is NULL."));
     }
   }
 #endif
-  PADDLE_ENFORCE_NE(vars.empty(), true, platform::errors::InvalidArgument(
-                                            "Variable names are empty."));
+  PADDLE_ENFORCE_NE(vars.empty(), true,
+                    platform::errors::InvalidArgument(
+                        "The variables to be deleted are empty."));
   for (auto *var : var_infos_) {
-    PADDLE_ENFORCE_NOT_NULL(var);
+    PADDLE_ENFORCE_NOT_NULL(var, platform::errors::InvalidArgument(
+                                     "The memory optimization info is NULL."));
   }
 }
@@ -64,7 +69,7 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() {
   if (event_) {
     auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_->GetPlace());
     platform::CUDADeviceGuard guard(gpu_place.device);
-    PADDLE_ENFORCE(cudaEventDestroy(event_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_));
   }
 #endif
 }
@@ -78,12 +83,17 @@ void EagerDeletionOpHandle::InitCUDA() {
 }
 void EagerDeletionOpHandle::CallOnce() {
-  PADDLE_ENFORCE(vars_.empty(), "vars_ must be initialized here");
+  PADDLE_ENFORCE_EQ(
+      vars_.empty(), true,
+      platform::errors::InvalidArgument(
+          "The variables to be deleted should be initialized here."));
   Scope *exec_scope = local_exec_scopes_[0];
   for (auto *var_info : var_infos_) {
     auto *var = exec_scope->FindVar(var_info->Name());
-    PADDLE_ENFORCE_NOT_NULL(var, "Variable %s should not be nullptr",
-                            var_info->Name());
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::NotFound(
+                 "The variable(%s) to be inplaced is not found in scope.",
+                 var_info->Name()));
     vars_.emplace_back(var);
   }
 }
@@ -119,8 +129,9 @@ void EagerDeletionOpHandle::RunImpl() {
         garbages.emplace_back(t.MoveMemoryHolder());
       }
     } else {
-      PADDLE_THROW("Type %s of %s is not supported eager deletion",
-                   framework::ToTypeName(var->Type()), var_info->Name());
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "The variable(%s) of type %s is not supported in eager deletion.",
+          framework::ToTypeName(var->Type()), var_info->Name()));
     }
   }
@@ -137,8 +148,9 @@ void EagerDeletionOpHandle::ClearGarbages(
     auto callback_stream =
         reinterpret_cast<StreamGarbageCollector *>(gc_)->stream();
     auto callback_func = [=]() {
-      PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
-      PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, compute_stream));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          cudaStreamWaitEvent(callback_stream, event_, 0));
     };
     gc_->Add(std::move(*garbages), callback_func);
   } else {
......
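The ClearGarbages callback above keeps the existing idiom of ordering the garbage-collector stream after the compute stream with a CUDA event instead of a device-wide synchronization. A self-contained sketch of that idiom using only CUDA runtime calls (the function and parameter names here are illustrative, and error checking is omitted):

    #include <cuda_runtime.h>

    // Order work queued later on gc_stream after work already queued on
    // compute_stream, without blocking the host or the whole device.
    void WaitComputeOnGcStream(cudaStream_t compute_stream,
                               cudaStream_t gc_stream) {
      cudaEvent_t event;
      // Timing is disabled because the event is only used for ordering.
      cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
      // Mark the point the compute stream has reached ...
      cudaEventRecord(event, compute_stream);
      // ... and make the gc stream wait for it before buffers are released.
      cudaStreamWaitEvent(gc_stream, event, 0);
      // Safe to destroy immediately; resources are freed once the wait completes.
      cudaEventDestroy(event);
    }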
@@ -12,8 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h"
+
 #include <algorithm>
 #include <utility>
+
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
@@ -56,10 +58,20 @@ void FusedAllReduceOpHandle::RunImpl() {
   size_t place_num = places_.size();
   PADDLE_ENFORCE_EQ(
       in_var_handles.size(), place_num * num_of_all_reduce_,
-      "The NoDummyInputSize should be equal to the number of places.");
+      platform::errors::PreconditionNotMet(
+          "The number of input variable handles should be equal to the number "
+          "of places plus the number of all reduce handles, "
+          "but got the number of input variable handles is %d, the "
+          "number of places is %d, and the number of all reduce handles "
+          "is %d.",
+          in_var_handles.size(), place_num, num_of_all_reduce_));
   PADDLE_ENFORCE_EQ(
       in_var_handles.size(), out_var_handles.size(),
-      "The NoDummyInputSize and NoDummyOutputSize should be equal.");
+      platform::errors::PreconditionNotMet(
+          "The number of input variable handles should be equal to the number "
+          "of output variable handles, but got the number of input variable "
+          "handles is %d, and the number of output variable handles is %d.",
+          in_var_handles.size(), out_var_handles.size()));
   // Note: some gradient op doesn't have CUDAKernel, so the gradients of
   // those op are in CPUPlace, in this case, the all reduce should not be fused.
@@ -106,7 +118,13 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc(
       dtype = ele_dtype;
     }
-    PADDLE_ENFORCE_EQ(ele_dtype, dtype);
+    PADDLE_ENFORCE_EQ(
+        ele_dtype, dtype,
+        platform::errors::InvalidArgument(
+            "The DataType of grad tensors of fused_all_reduce_op_handle "
+            "must be consistent. The current dtype is %s, but the "
+            "previous dtype is %s.",
+            DataTypeToString(ele_dtype), DataTypeToString(dtype)));
     // Check whether the address space is contiguous.
     std::sort(
@@ -130,16 +148,29 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc(
           "input[%d] address: 0X%02x. The offset: %d",
           k - 1, g_tensor.at(k - 1).first, cur_address, g_tensor.at(k).first, k,
           next_address, k, infer_next_address, offset);
-      PADDLE_ENFORCE_EQ(infer_next_address, next_address,
-                        "The address is not consistent.");
+      PADDLE_ENFORCE_EQ(
+          infer_next_address, next_address,
+          platform::errors::InvalidArgument(
+              "The infered address of the next tensor should be equal to the "
+              "real address of the next tensor. But got infered address is %p "
+              "and real address is %p.",
+              infer_next_address, next_address));
     }
   }
   if (!FLAGS_skip_fused_all_reduce_check) {
     for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
       for (size_t j = 1; j < num_of_all_reduce_; ++j) {
-        PADDLE_ENFORCE_EQ(grads_tensor.at(0).at(j).first,
-                          grads_tensor.at(scope_idx).at(j).first);
+        PADDLE_ENFORCE_EQ(
+            grads_tensor.at(0).at(j).first,
+            grads_tensor.at(scope_idx).at(j).first,
+            platform::errors::InvalidArgument(
+                "The variable name of grad tensors of "
+                "fused_all_reduce_op_handle "
+                "must be consistent. The current name is %s, but the "
+                "previous name is %s.",
+                grads_tensor.at(0).at(j).first,
+                grads_tensor.at(scope_idx).at(j).first));
       }
     }
   }
@@ -167,7 +198,9 @@ bool FusedAllReduceOpHandle::InputIsInDifferentPlace(
     for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
       auto var_name = in_var_handles[j]->name();
       auto var = local_scope->FindVar(var_name);
-      PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name);
+      PADDLE_ENFORCE_NOT_NULL(
+          var, platform::errors::NotFound(
+                   "The variable '%s' is not found in local scope.", var_name));
       auto &lod_tensor = var->Get<LoDTensor>();
       if (!is_same_place(lod_tensor.place(), places_.at(scope_idx))) {
         return true;
@@ -185,14 +218,24 @@ void FusedAllReduceOpHandle::GetGradLoDTensor(
   size_t place_num = places_.size();
   for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
     auto var_name = in_var_handles[j]->name();
-    PADDLE_ENFORCE_EQ(var_name, out_var_handles[j]->name());
+    PADDLE_ENFORCE_EQ(
+        var_name, out_var_handles[j]->name(),
+        platform::errors::InvalidArgument(
+            "The name of input variable should be equal "
+            "to the name of output variable. But got the name of input "
+            "variable is %s and the name of output variable is %s.",
+            var_name, out_var_handles[j]->name()));
     auto var = local_scope->FindVar(var_name);
-    PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name);
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::NotFound(
+                 "The variable '%s' is not found in local scope.", var_name));
    auto &lod_tensor = var->Get<LoDTensor>();
     PADDLE_ENFORCE_EQ(
         platform::is_same_place(lod_tensor.place(), places_.at(scope_idx)),
-        true, "%s(%d) is not in the right place.", var_name, scope_idx);
+        true, platform::errors::InvalidArgument(
+                  "The variable '%s' at scope %d is not in the right place.",
+                  var_name, scope_idx));
     grad_tensor->emplace_back(std::make_pair(var_name, &lod_tensor));
   }
 }
@@ -204,16 +247,26 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
   size_t size_of_dtype = 0;
   for (size_t i = 0; i < grad_tensor.size(); ++i) {
     // Get dtype
-    auto ele_type = grad_tensor.at(i).second->type();
+    auto ele_dtype = grad_tensor.at(i).second->type();
     if (i == 0) {
-      *dtype = ele_type;
-      size_of_dtype = framework::SizeOfType(ele_type);
+      *dtype = ele_dtype;
+      size_of_dtype = framework::SizeOfType(ele_dtype);
     }
-    PADDLE_ENFORCE_EQ(ele_type, *dtype);
+    PADDLE_ENFORCE_EQ(
+        ele_dtype, *dtype,
+        platform::errors::InvalidArgument(
+            "The DataType of grad tensors of fused_all_reduce_op_handle "
+            "must be consistent. The current dtype is %s, but the "
+            "previous dtype is %s.",
+            DataTypeToString(ele_dtype), DataTypeToString(*dtype)));
     // Get element number
     int64_t len = grad_tensor.at(i).second->numel();
-    PADDLE_ENFORCE_GT(len, 0);
+    PADDLE_ENFORCE_GT(
+        len, 0, platform::errors::InvalidArgument(
+                    "The size of grad tensors of fused_all_reduce_op_handle "
+                    "must be > 0, but got %d.",
+                    len));
     *numel +=
         platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
   }
......
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
+
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -32,7 +33,15 @@ void FusedBroadcastOpHandle::RunImpl() {
   WaitInputVarGenerated();
   size_t place_num = places_.size();
-  PADDLE_ENFORCE_EQ(in_var_handles.size() * place_num, out_var_handles.size());
+  PADDLE_ENFORCE_EQ(
+      in_var_handles.size() * place_num, out_var_handles.size(),
+      platform::errors::PreconditionNotMet(
+          "The number of input variable handles plus the number "
+          "of places should be equal to the number of output variable handles, "
+          "but got the number of input variable handles is %d, the "
+          "number of places is %d, and the number of output variable handles "
+          "is %d.",
+          in_var_handles.size(), place_num, out_var_handles.size()));
   for (size_t i = 0; i < in_var_handles.size(); ++i) {
     BroadcastOneVar(
......
@@ -13,8 +13,10 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
+
 #include <memory>
 #include <unordered_map>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/details/broadcast_op_handle_test.h"
 #include "paddle/fluid/framework/details/op_handle_base.h"
@@ -58,7 +60,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
       op_handle_ = new FusedBroadcastOpHandle(
           nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
 #else
-      PADDLE_THROW("CUDA is not supported.");
+      PADDLE_THROW(
+          platform::errors::PreconditionNotMet("Not compiled with CUDA."));
 #endif
     } else {
 #if defined(PADDLE_WITH_NCCL)
......
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/gather_op_handle.h"
+
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
@@ -32,13 +33,20 @@ void GatherOpHandle::RunImpl() {
   PADDLE_ENFORCE_EQ(
       in_var_handles.size(), places_.size(),
-      "The number of output should equal to the number of places.");
+      platform::errors::InvalidArgument(
+          "The number of input variables should be equal "
+          "to the number of places, but got the number of input variables is "
+          "%d and the number of places is %d.",
+          in_var_handles.size(), places_.size()));
   VarHandle *out_var_handle;
   {
     auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
-    PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
-                      "The number of output should be one.");
+    PADDLE_ENFORCE_EQ(
+        out_var_handles.size(), 1,
+        platform::errors::InvalidArgument(
+            "The number of output variables should be 1, but got %d.",
+            out_var_handles.size()));
     out_var_handle = out_var_handles.front();
   }
@@ -47,10 +55,14 @@ void GatherOpHandle::RunImpl() {
   auto in_0_handle = in_var_handles[0];
   auto pre_in_var =
       var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name());
-  PADDLE_ENFORCE_NOT_NULL(pre_in_var);
+  PADDLE_ENFORCE_NOT_NULL(
+      pre_in_var,
+      platform::errors::NotFound("The variable '%s' is not found in the scope.",
+                                 in_0_handle->name()));
-  PADDLE_ENFORCE(pre_in_var->IsType<framework::SelectedRows>(),
-                 "Currently, gather_op only can gather SelectedRows.");
+  PADDLE_ENFORCE_EQ(pre_in_var->IsType<framework::SelectedRows>(), true,
+                    platform::errors::Unimplemented(
+                        "Currently, gather_op only supports SelectedRows."));
   // Wait input done, this Wait is asynchronous operation
   WaitInputVarGenerated();
@@ -63,7 +75,10 @@ void GatherOpHandle::RunImpl() {
   for (auto *in_handle : in_var_handles) {
     auto *in_var =
         var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name());
-    PADDLE_ENFORCE_NOT_NULL(in_var);
+    PADDLE_ENFORCE_NOT_NULL(
+        in_var,
+        platform::errors::NotFound(
+            "The variable '%s' is not found in the scope.", in_handle->name()));
     VariableVisitor::EnforceShapeAndDTypeEQ(*in_var, *pre_in_var);
     auto &in_sr_value = in_var->Get<framework::SelectedRows>();
@@ -76,15 +91,19 @@ void GatherOpHandle::RunImpl() {
   // NOTE: The Places of all input tensor must be all on CPU or all on GPU.
   platform::Place t_out_p = out_var_handle->place();
   if (platform::is_gpu_place(pre_in_value.place())) {
-    PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
-                   "Places of input and output must be all on GPU.");
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(t_out_p), true,
+                      platform::errors::PreconditionNotMet(
+                          "Places of input and output must be all on GPU."));
   } else {
     t_out_p = platform::CPUPlace();
   }
   auto out_var = var_scopes.at(out_var_handle->scope_idx())
                      ->FindVar(out_var_handle->name());
-  PADDLE_ENFORCE_NOT_NULL(out_var);
+  PADDLE_ENFORCE_NOT_NULL(
+      out_var,
+      platform::errors::NotFound("The variable '%s' is not found in the scope.",
+                                 out_var_handle->name()));
   auto out_value = out_var->GetMutable<framework::SelectedRows>();
   out_value->set_height(pre_in_value.height());
   out_value->set_rows(out_rows);
......
@@ -13,8 +13,10 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/gather_op_handle.h"
+
 #include <memory>
 #include <unordered_map>
+
 #include "gtest/gtest.h"
 namespace paddle {
@@ -60,7 +62,8 @@ struct TestGatherOpHandle {
         ctxs_.emplace_back(new p::CUDADeviceContext(p));
       }
 #else
-      PADDLE_THROW("CUDA is not support.");
+      PADDLE_THROW(
+          platform::errors::PreconditionNotMet("Not compiled with CUDA."));
 #endif
     } else {
       int count = 8;
@@ -141,7 +144,9 @@ struct TestGatherOpHandle {
     for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
          ++input_scope_idx) {
       auto in_var = param_scopes_.at(input_scope_idx)->FindVar("input");
-      PADDLE_ENFORCE_NOT_NULL(in_var);
+      PADDLE_ENFORCE_NOT_NULL(
+          in_var, platform::errors::NotFound(
+                      "The variable '%s' is not found in the scope.", "input"));
       auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
       auto value = in_selected_rows->mutable_value();
       value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
@@ -155,7 +160,9 @@ struct TestGatherOpHandle {
     }
     auto out_var = param_scopes_.at(output_scope_idx)->FindVar("out");
-    PADDLE_ENFORCE_NOT_NULL(out_var);
+    PADDLE_ENFORCE_NOT_NULL(
+        out_var, platform::errors::NotFound(
+                     "The variable '%s' is not found in the scope.", "out"));
     auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
     auto in_var = param_scopes_.at(output_scope_idx)->FindVar("input");
@@ -173,9 +180,19 @@ struct TestGatherOpHandle {
     auto& out_select_rows = out_var->Get<f::SelectedRows>();
     auto rt = out_select_rows.value();
-    PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal.");
+    PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
+                      platform::errors::InvalidArgument(
+                          "The height of SelectedRows is not equal to "
+                          "the expected, expect %d, but got %d.",
+                          height, out_select_rows.height()));
     for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
-      PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]);
+      PADDLE_ENFORCE_EQ(
+          out_select_rows.rows()[k], rows[k % rows.size()],
+          platform::errors::InvalidArgument(
+              "The item at position %d of rows of SelectedRows is not equal to "
+              "the expected, expect %d, but got %d.",
+              k, rows[k % rows.size()], out_select_rows.rows()[k]));
     }
     f::Tensor result_tensor;
@@ -207,6 +224,7 @@ TEST(GatherTester, TestGPUGatherTestSelectedRows) {
   test_op.TestGatherSelectedRows(input_scope_idx);
 }
 #endif
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
@@ -46,14 +46,17 @@ class NCCLOpHandleBase : public OpHandleBase {
   }
   virtual ~NCCLOpHandleBase() {
     for (auto& ev : inter_events_) {
-      PADDLE_ENFORCE(cudaEventDestroy(ev.second));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
     }
     for (auto& ev : exter_events_) {
-      PADDLE_ENFORCE(cudaEventDestroy(ev.second));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
     }
   }
   void SetRunEnv(int run_order, bool use_hierarchical_allreduce) {
-    PADDLE_ENFORCE(run_order >= 0, "run_order must >= 0");
+    PADDLE_ENFORCE_GE(
+        run_order, 0,
+        platform::errors::InvalidArgument(
+            "The argument run_order must be >= 0, but got %d.", run_order));
     run_order_ = run_order;
     use_hierarchical_allreduce_ = use_hierarchical_allreduce;
@@ -74,8 +77,11 @@ class NCCLOpHandleBase : public OpHandleBase {
       return;
     }
-    PADDLE_ENFORCE(places_.size() == 1,
-                   "HierarchicalAllReduce run one proc with one card mode.");
+    PADDLE_ENFORCE_EQ(places_.size(), 1,
+                      platform::errors::InvalidArgument(
+                          "HierarchicalAllReduce can only run "
+                          "one proccess with one card mode, but got %d cards.",
+                          places_.size()));
     for (auto& p : places_) {
       auto ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order);
@@ -88,11 +94,11 @@ class NCCLOpHandleBase : public OpHandleBase {
         continue;
       }
-      PADDLE_ENFORCE(cudaSetDevice(dev_id));
-      PADDLE_ENFORCE(cudaEventCreateWithFlags(&inter_events_[dev_id],
-                                              cudaEventDisableTiming));
-      PADDLE_ENFORCE(cudaEventCreateWithFlags(&exter_events_[dev_id],
-                                              cudaEventDisableTiming));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
+          &inter_events_[dev_id], cudaEventDisableTiming));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
+          &exter_events_[dev_id], cudaEventDisableTiming));
       VLOG(10) << "Create events on dev_id:" << dev_id
                << ", inter_event:" << &inter_events_[dev_id]
                << ", exter_event:" << &exter_events_[dev_id];
@@ -102,7 +108,10 @@ class NCCLOpHandleBase : public OpHandleBase {
   void FlatNCCLAllReduce(platform::Place place, const void* sendbuff,
                          void* recvbuff, size_t count, ncclDataType_t datatype,
                          ncclRedOp_t op) {
-    PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0");
+    PADDLE_ENFORCE_GE(
+        run_order_, 0,
+        platform::errors::InvalidArgument(
+            "The argument run_order_ must be >= 0, but got %d.", run_order_));
     auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_);
     int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
     auto& nccl_ctx = flat_nccl_ctxs->at(dev_id);
@@ -113,14 +122,17 @@ class NCCLOpHandleBase : public OpHandleBase {
              << ", dev_id:" << dev_id << ", dtype:" << datatype
              << ", place:" << place;
-    PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
         sendbuff, recvbuff, count, datatype, op, comm, stream));
   }
   void NCCLAllReduce(platform::Place place, const void* sendbuff,
                      void* recvbuff, size_t count, ncclDataType_t datatype,
                      ncclRedOp_t op) {
-    PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0");
+    PADDLE_ENFORCE_GE(
+        run_order_, 0,
+        platform::errors::InvalidArgument(
+            "The argument run_order_ must be >= 0, but got %d.", run_order_));
     if (!use_hierarchical_allreduce_) {
       FlatNCCLAllReduce(place, sendbuff, recvbuff, count, datatype, op);
       return;
@@ -132,7 +144,10 @@ class NCCLOpHandleBase : public OpHandleBase {
   void HierarchicalAllReduce(platform::Place place, const void* sendbuff,
                              void* recvbuff, size_t count,
                              ncclDataType_t datatype, ncclRedOp_t op) {
-    PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0");
+    PADDLE_ENFORCE_GE(
+        run_order_, 0,
+        platform::errors::InvalidArgument(
+            "The argument run_order_ must be >= 0, but got %d.", run_order_));
     InterReduce(place, sendbuff, recvbuff, count, datatype, op);
     // When a trainer is not in exter allreduce ring
     // they need not to call this.
@@ -157,14 +172,13 @@ class NCCLOpHandleBase : public OpHandleBase {
              << ", dtype:" << datatype << ", place:" << place
              << ", stream:" << stream;
-    PADDLE_ENFORCE(platform::dynload::ncclReduce(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
         sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream));
     cudaEventRecord(inter_events_.at(dev_id), stream);
     if (FLAGS_sync_nccl_allreduce) {
-      PADDLE_ENFORCE(cudaStreamSynchronize(stream),
-                     "sync HierarchicalAllReduce inter stream error");
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
     }
   }
@@ -172,7 +186,9 @@
                  void* recvbuff, size_t count, ncclDataType_t datatype,
                  ncclRedOp_t op) {
     auto nccl_ctxs = nccl_ctxs_->GetHierarchicalExterCtx(run_order_);
-    PADDLE_ENFORCE(nccl_ctxs_, "can't get exter %d nccl_ctxs", run_order_);
+    PADDLE_ENFORCE_NOT_NULL(
+        nccl_ctxs_, platform::errors::NotFound(
+                        "Can't get exter %d nccl contexts.", run_order_));
     int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
     auto& nccl_ctx = nccl_ctxs->at(dev_id);
     auto stream = nccl_ctx.stream();
@@ -185,14 +201,13 @@
     cudaStreamWaitEvent(stream, inter_events_.at(dev_id), 0);
-    PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
         sendbuff, recvbuff, count, datatype, op, comm, stream));
     cudaEventRecord(exter_events_.at(dev_id), stream);
     if (FLAGS_sync_nccl_allreduce) {
-      PADDLE_ENFORCE(cudaStreamSynchronize(stream),
-                     "sync HierarchicalAllReduce exter stream error");
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
     }
   }
@@ -210,8 +225,8 @@ class NCCLOpHandleBase : public OpHandleBase {
              << ", stream:" << stream;
     cudaStreamWaitEvent(stream, exter_events_.at(dev_id), 0);
-    PADDLE_ENFORCE(platform::dynload::ncclBcast(sendbuff, count, datatype, 0,
-                                                comm, stream));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
+        sendbuff, count, datatype, 0, comm, stream));
   }
  protected:
......
@@ -47,8 +47,8 @@ void OpHandleBase::InitCUDA() {
 #ifdef PADDLE_WITH_CUDA
   for (auto &p : dev_ctxes_) {
     int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
-    PADDLE_ENFORCE(cudaSetDevice(dev_id));
-    PADDLE_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming));
   }
   if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) {
@@ -62,17 +62,22 @@ void OpHandleBase::InitCUDA() {
       }
     }
   } else {
-    PADDLE_ENFORCE_EQ(dev_ctxes_.size(), 1UL,
-                      "%s should have only one dev_ctx.", Name());
+    PADDLE_ENFORCE_EQ(
+        dev_ctxes_.size(), 1UL,
+        platform::errors::InvalidArgument(
+            "Operator %s should have only one dev_ctx, but got %d.", Name(),
+            dev_ctxes_.size()));
     auto &place = dev_ctxes_.begin()->first;
     int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
     for (auto &out_var : outputs_) {
       auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
       if (out_var_handle) {
-        PADDLE_ENFORCE(platform::is_same_place(place, out_var_handle->place()),
-                       "The place of output(%s) is not consistent with the "
-                       "place of current op(%s).",
-                       out_var_handle->Name(), Name());
+        PADDLE_ENFORCE_EQ(
+            platform::is_same_place(place, out_var_handle->place()), true,
+            platform::errors::InvalidArgument(
+                "The place of output(%s) is not consistent with the "
+                "place of current op(%s).",
+                out_var_handle->Name(), Name()));
         out_var_handle->SetGenerateEvent(events_.at(dev_id));
       }
     }
@@ -86,7 +91,10 @@ void OpHandleBase::Run(bool use_cuda) {
     InitCUDA();
   }
 #else
-  PADDLE_ENFORCE(!use_cuda);
+  PADDLE_ENFORCE_EQ(use_cuda, false,
+                    platform::errors::InvalidArgument(
+                        "Argument use_cuda should be false when Paddle is not "
+                        "compiled with CUDA."));
 #endif
   // skip running current op, used with inplace_addto_op_pass
@@ -100,17 +108,20 @@ void OpHandleBase::Run(bool use_cuda) {
 void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
 #ifdef PADDLE_WITH_CUDA
-  PADDLE_ENFORCE_NOT_NULL(waited_ctx);
+  PADDLE_ENFORCE_NOT_NULL(waited_ctx, platform::errors::InvalidArgument(
+                                          "Argument waited_ctx is NULL."));
   if (platform::is_cpu_place(waited_ctx->GetPlace()) || events_.empty()) {
     for (auto &dev_ctx : dev_ctxes_) {
-      PADDLE_ENFORCE_NOT_NULL(dev_ctx.second);
+      PADDLE_ENFORCE_NOT_NULL(
+          dev_ctx.second,
+          platform::errors::InvalidArgument("The device context is NULL."));
       dev_ctx.second->Wait();
     }
   } else {
     auto stream =
         static_cast<platform::CUDADeviceContext *>(waited_ctx)->stream();
     for (auto &ev : events_) {
-      PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream, ev.second, 0));
     }
   }
 #else
@@ -145,10 +156,11 @@ void OpHandleBase::WaitInputVarGenerated() {
         auto stream =
             static_cast<platform::CUDADeviceContext *>(dev_ctxes_.at(place))
                 ->stream();
-        PADDLE_ENFORCE(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
             cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
 #else
-        PADDLE_THROW("Doesn't compile the GPU.");
+        PADDLE_THROW(
+            platform::errors::PreconditionNotMet("Not compiled with CUDA."));
 #endif
       }
       // There are nothing to do when the place is CPUPlace.
@@ -169,10 +181,11 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
         auto stream = static_cast<platform::CUDADeviceContext *>(
                           dev_ctxes_.at(in_var_handle->place()))
                           ->stream();
-        PADDLE_ENFORCE(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
            cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
 #else
-        PADDLE_THROW("Doesn't compile the GPU.");
+        PADDLE_THROW(
+            platform::errors::PreconditionNotMet("Not compiled with CUDA."));
 #endif
       }
       // There are nothing to do when the place is CPUPlace.
@@ -242,7 +255,9 @@ void OpHandleBase::SetLocalExecScopes(
   auto scopes = GetLocalScopes();
   for (auto *scope : scopes) {
     auto iter = scope_map.find(scope);
-    PADDLE_ENFORCE(iter != scope_map.end(), "Local scope not found");
+    PADDLE_ENFORCE_NE(
+        iter, scope_map.end(),
+        platform::errors::NotFound("Local scope not found in scope map."));
     local_exec_scopes_.emplace_back(iter->second);
   }
 }
......
@@ -21,6 +21,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/grad_op_desc_maker.h"
 #include "paddle/fluid/framework/inplace_op_inference.h"
 #include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
@@ -186,19 +187,20 @@ struct OpInfoFiller<T, kOpProtoAndCheckerMaker> {
   void operator()(const char* op_type, OpInfo* info) const {
     PADDLE_ENFORCE_EQ(info->proto_, nullptr,
                       platform::errors::AlreadyExists(
-                          "OpProto of %s has been registered", op_type));
+                          "OpProto of %s has been registered.", op_type));
     PADDLE_ENFORCE_EQ(info->checker_, nullptr,
                       platform::errors::AlreadyExists(
-                          "OpAttrChecker of %s has been registered", op_type));
+                          "OpAttrChecker of %s has been registered.", op_type));
     info->proto_ = new proto::OpProto;
     info->checker_ = new OpAttrChecker();
     T maker;
     maker(info->proto_, info->checker_);
     info->proto_->set_type(op_type);
-    PADDLE_ENFORCE(
-        info->proto_->IsInitialized(),
-        "Fail to initialize %s's OpProto, because %s is not initialized",
-        op_type, info->proto_->InitializationErrorString());
+    PADDLE_ENFORCE_EQ(
+        info->proto_->IsInitialized(), true,
+        platform::errors::PreconditionNotMet(
+            "Fail to initialize %s's OpProto, because %s is not initialized.",
+            op_type, info->proto_->InitializationErrorString()));
   }
 };
......
@@ -16,6 +16,7 @@
 #include <algorithm>
 #include <map>
 #include <vector>
+
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/selected_rows.h"
@@ -32,9 +33,13 @@ struct ReduceLoDTensor {
   template <typename T>
   void apply() const {
-    PADDLE_ENFORCE(!src_tensors_.empty());
+    PADDLE_ENFORCE_NE(src_tensors_.empty(), true,
+                      platform::errors::InvalidArgument(
+                          "The number of tensors to be reduced is 0."));
     auto &t0 = *src_tensors_[0];
-    PADDLE_ENFORCE_NE(t0.numel(), 0);
+    PADDLE_ENFORCE_NE(t0.numel(), 0,
+                      platform::errors::InvalidArgument(
+                          "The size of first tensor to be reduced is 0."));
     dst_tensor_.Resize(t0.dims());
     T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
@@ -45,8 +50,19 @@ struct ReduceLoDTensor {
         continue;
       }
-      PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
-      PADDLE_ENFORCE_EQ(t.type(), t0.type());
+      PADDLE_ENFORCE_EQ(t.dims(), t0.dims(),
+                        platform::errors::InvalidArgument(
+                            "The shape of tensors to be reduced must be "
+                            "consistent. The shape of current tensor is %s, "
+                            "but the shape of the first tensor is %s.",
+                            t.dims(), t0.dims()));
+      PADDLE_ENFORCE_EQ(t.type(), t0.type(),
+                        platform::errors::InvalidArgument(
+                            "The type of tensors to be reduced must be "
+                            "consistent. The type of current tensor is %s, "
+                            "but the type of the first tensor is %s.",
+                            t.type(), t0.type()));
       std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
                      [](T a, T b) -> T { return a + b; });
     }
@@ -88,7 +104,9 @@ struct GatherLocalSelectedRowsFunctor {
         in_places_(in_places),
         out_place_(out_place),
         dst_selected_rows_(dst_selected_rows) {
-    PADDLE_ENFORCE_EQ(src_selected_rows.empty(), false);
+    PADDLE_ENFORCE_NE(src_selected_rows.empty(), true,
+                      platform::errors::InvalidArgument(
+                          "The number of selected_rows to be gathered is 0."));
     std::vector<int64_t> out_rows;
......
...@@ -13,7 +13,9 @@ ...@@ -13,7 +13,9 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/reduce_op_handle.h"
#include <memory> #include <memory>
#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/framework/details/variable_visitor.h"
...@@ -116,8 +118,15 @@ void ReduceOpHandle::GatherSelectedRows( ...@@ -116,8 +118,15 @@ void ReduceOpHandle::GatherSelectedRows(
merged_dev_ctx->Wait(); merged_dev_ctx->Wait();
scope->EraseVars(std::vector<std::string>{gathered_var_name}); scope->EraseVars(std::vector<std::string>{gathered_var_name});
PADDLE_ENFORCE(client->Gather(vars, &remote, *merged_dev_ctx, scope)); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE(remote.size() == vars.size()); client->Gather(vars, &remote, *merged_dev_ctx, scope), true,
platform::errors::PreconditionNotMet("Gather SelectedRows failed."));
PADDLE_ENFORCE_EQ(remote.size(), vars.size(),
platform::errors::PreconditionNotMet(
"The number of remotes should be equal to the number "
"of variables to be gathered, but got the number of "
"remotes is %d and the number of variables is %d.",
remote.size(), vars.size()));
// 4. merged local selected rows. // 4. merged local selected rows.
std::vector<const SelectedRows *> all; std::vector<const SelectedRows *> all;
...@@ -151,14 +160,19 @@ void ReduceOpHandle::RunImpl() { ...@@ -151,14 +160,19 @@ void ReduceOpHandle::RunImpl() {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(), in_var_handles.size(), places_.size(),
"The number of output should equal to the number of places."); platform::errors::InvalidArgument(
"The number of inputs should equal to the number of places, but got "
"the number of inputs is %d and the number of places is %d.",
in_var_handles.size(), places_.size()));
VarHandle *out_var_handle; VarHandle *out_var_handle;
{ {
auto out_var_handles = DynamicCast<VarHandle>(outputs_); auto out_var_handles = DynamicCast<VarHandle>(outputs_);
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1UL, PADDLE_ENFORCE_EQ(out_var_handles.size(), 1UL,
"The number of output should be one."); platform::errors::InvalidArgument(
"The number of output should be one, but got %d.",
out_var_handles.size()));
out_var_handle = out_var_handles.front(); out_var_handle = out_var_handles.front();
} }
...@@ -168,7 +182,10 @@ void ReduceOpHandle::RunImpl() { ...@@ -168,7 +182,10 @@ void ReduceOpHandle::RunImpl() {
auto pre_in_var = auto pre_in_var =
var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name()); var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name());
PADDLE_ENFORCE_NOT_NULL(pre_in_var);
PADDLE_ENFORCE_NOT_NULL(pre_in_var, platform::errors::NotFound(
"Variable %s is not found in scope.",
in_0_handle->name()));
// NOTE: The Places of all input tensor must be all on CPU or all on GPU. // NOTE: The Places of all input tensor must be all on CPU or all on GPU.
std::vector<platform::Place> in_places; // used to get dev_ctx std::vector<platform::Place> in_places; // used to get dev_ctx
...@@ -176,21 +193,29 @@ void ReduceOpHandle::RunImpl() { ...@@ -176,21 +193,29 @@ void ReduceOpHandle::RunImpl() {
in_places.emplace_back(in_handle->place()); in_places.emplace_back(in_handle->place());
auto in_var = auto in_var =
var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name()); var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name());
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound("Variable %s is not found in scope.",
in_handle->name()));
VariableVisitor::EnforceShapeAndDTypeEQ(*pre_in_var, *in_var); VariableVisitor::EnforceShapeAndDTypeEQ(*pre_in_var, *in_var);
} }
auto out_var = var_scopes.at(out_var_handle->scope_idx()) auto out_var = var_scopes.at(out_var_handle->scope_idx())
->FindVar(out_var_handle->name()); ->FindVar(out_var_handle->name());
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_NOT_NULL(
out_var, platform::errors::NotFound("Variable %s is not found in scope.",
out_var_handle->name()));
// NOTE: The tensors' Place of input and output must be all on GPU or all on // NOTE: The tensors' Place of input and output must be all on GPU or all on
// CPU. // CPU.
auto in_p = VariableVisitor::GetMutableTensor(pre_in_var).place(); auto in_p = VariableVisitor::GetMutableTensor(pre_in_var).place();
platform::Place t_out_p; platform::Place t_out_p;
if (platform::is_gpu_place(in_p)) { if (platform::is_gpu_place(in_p)) {
PADDLE_ENFORCE(platform::is_gpu_place(out_var_handle->place()), PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_var_handle->place()), true,
"Places of input and output must be all on GPU."); platform::errors::PreconditionNotMet(
"Places of input and output must be all on GPU."));
t_out_p = out_var_handle->place(); t_out_p = out_var_handle->place();
} else { } else {
t_out_p = platform::CPUPlace(); t_out_p = platform::CPUPlace();
...@@ -229,7 +254,10 @@ void ReduceOpHandle::RunImpl() { ...@@ -229,7 +254,10 @@ void ReduceOpHandle::RunImpl() {
in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p, in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
out_var->GetMutable<framework::SelectedRows>()); out_var->GetMutable<framework::SelectedRows>());
} else { } else {
PADDLE_THROW("only support double or float when gather SelectedRows"); PADDLE_THROW(platform::errors::Unimplemented(
"Only support double or float when gather SelectedRows, but got "
"%s.",
framework::DataTypeToString(in_selected_rows[0]->value().type())));
} }
#endif #endif
}); });
...@@ -292,7 +320,7 @@ void ReduceOpHandle::RunImpl() { ...@@ -292,7 +320,7 @@ void ReduceOpHandle::RunImpl() {
size_t numel = static_cast<size_t>(lod_tensor.numel()); size_t numel = static_cast<size_t>(lod_tensor.numel());
all_reduce_calls.emplace_back( all_reduce_calls.emplace_back(
[buffer, recvbuffer, type, numel, root_id, &nccl_ctx] { [buffer, recvbuffer, type, numel, root_id, &nccl_ctx] {
PADDLE_ENFORCE(platform::dynload::ncclReduce( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
buffer, recvbuffer, numel, static_cast<ncclDataType_t>(type), buffer, recvbuffer, numel, static_cast<ncclDataType_t>(type),
ncclSum, root_id, nccl_ctx.comm_, nccl_ctx.stream())); ncclSum, root_id, nccl_ctx.comm_, nccl_ctx.stream()));
}); });
...@@ -306,10 +334,13 @@ void ReduceOpHandle::RunImpl() { ...@@ -306,10 +334,13 @@ void ReduceOpHandle::RunImpl() {
} }
}); });
#else #else
PADDLE_THROW("CUDA is not enabled."); PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif #endif
} else { } else {
PADDLE_THROW("Place should be CPUPlace or CUDAPlace."); PADDLE_THROW(platform::errors::InvalidArgument(
"The place of tensor should be CPUPlace or CUDAPlace, but got %s.",
lod_tensors[0]->place()));
} }
} }
} }
......
...@@ -13,7 +13,9 @@ ...@@ -13,7 +13,9 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/reduce_op_handle.h"
#include <unordered_map> #include <unordered_map>
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
...@@ -69,7 +71,8 @@ struct TestReduceOpHandle { ...@@ -69,7 +71,8 @@ struct TestReduceOpHandle {
} }
nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_)); nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
#else #else
PADDLE_THROW("CUDA is not support."); PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCLL."));
#endif #endif
} else { } else {
int count = 8; int count = 8;
...@@ -103,7 +106,8 @@ struct TestReduceOpHandle { ...@@ -103,7 +106,8 @@ struct TestReduceOpHandle {
op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_, op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
gpu_list_, nccl_ctxs_.get())); gpu_list_, nccl_ctxs_.get()));
#else #else
PADDLE_THROW("CUDA is not support."); PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCLL."));
#endif #endif
} else { } else {
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL)
...@@ -164,7 +168,10 @@ struct TestReduceOpHandle { ...@@ -164,7 +168,10 @@ struct TestReduceOpHandle {
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size(); for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) { ++input_scope_idx) {
auto in_var = param_scopes_[input_scope_idx]->FindVar("input"); auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound(
"Variable %s is not found in scope.", "input"));
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>(); auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
auto value = in_selected_rows->mutable_value(); auto value = in_selected_rows->mutable_value();
value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]); value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
...@@ -178,7 +185,9 @@ struct TestReduceOpHandle { ...@@ -178,7 +185,9 @@ struct TestReduceOpHandle {
} }
auto out_var = param_scopes_[output_scope_idx]->FindVar("out"); auto out_var = param_scopes_[output_scope_idx]->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var); PADDLE_ENFORCE_NOT_NULL(out_var,
platform::errors::NotFound(
"Variable %s is not found in scope.", "out"));
auto out_selected_rows = out_var->GetMutable<f::SelectedRows>(); auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
auto in_var = param_scopes_[output_scope_idx]->FindVar("input"); auto in_var = param_scopes_[output_scope_idx]->FindVar("input");
...@@ -196,9 +205,18 @@ struct TestReduceOpHandle { ...@@ -196,9 +205,18 @@ struct TestReduceOpHandle {
auto &out_select_rows = out_var->Get<f::SelectedRows>(); auto &out_select_rows = out_var->Get<f::SelectedRows>();
auto rt = out_select_rows.value(); auto rt = out_select_rows.value();
PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal."); PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
platform::errors::InvalidArgument(
"The height of SelectedRows is not equal to "
"the expected, expect %d, but got %d.",
height, out_select_rows.height()));
for (size_t k = 0; k < out_select_rows.rows().size(); ++k) { for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]); PADDLE_ENFORCE_EQ(
out_select_rows.rows()[k], rows[k % rows.size()],
platform::errors::InvalidArgument(
"The item at position %d of rows of SelectedRows is not equal to "
"the expected, expect %d, but got %d.",
k, rows[k % rows.size()], out_select_rows.rows()[k]));
} }
f::Tensor result_tensor; f::Tensor result_tensor;
...@@ -208,7 +226,7 @@ struct TestReduceOpHandle { ...@@ -208,7 +226,7 @@ struct TestReduceOpHandle {
for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) { for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) {
ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5); ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5);
} }
} }
void TestReduceLodTensors(size_t output_scope_idx) { void TestReduceLodTensors(size_t output_scope_idx) {
std::vector<float> send_vector(static_cast<size_t>(f::product(kDims))); std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
...@@ -220,7 +238,9 @@ struct TestReduceOpHandle { ...@@ -220,7 +238,9 @@ struct TestReduceOpHandle {
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size(); for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) { ++input_scope_idx) {
auto in_var = param_scopes_[input_scope_idx]->FindVar("input"); auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var); PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound(
"Variable %s is not found in scope.", "input"));
auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>(); auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]); in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
in_lod_tensor->set_lod(lod); in_lod_tensor->set_lod(lod);
...@@ -230,7 +250,9 @@ struct TestReduceOpHandle { ...@@ -230,7 +250,9 @@ struct TestReduceOpHandle {
} }
auto out_var = param_scopes_[output_scope_idx]->FindVar("out"); auto out_var = param_scopes_[output_scope_idx]->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var); PADDLE_ENFORCE_NOT_NULL(out_var,
platform::errors::NotFound(
"Variable %s is not found in scope.", "out"));
auto out_lodtensor = out_var->GetMutable<f::LoDTensor>(); auto out_lodtensor = out_var->GetMutable<f::LoDTensor>();
auto in_var = param_scopes_[output_scope_idx]->FindVar("input"); auto in_var = param_scopes_[output_scope_idx]->FindVar("input");
...@@ -254,7 +276,7 @@ struct TestReduceOpHandle { ...@@ -254,7 +276,7 @@ struct TestReduceOpHandle {
ASSERT_NEAR(ct[j], send_vector[j] * gpu_list_.size(), 1e-5); ASSERT_NEAR(ct[j], send_vector[j] * gpu_list_.size(), 1e-5);
} }
} }
}; };
TEST(ReduceTester, TestCPUReduceTestSelectedRows) { TEST(ReduceTester, TestCPUReduceTestSelectedRows) {
TestReduceOpHandle test_op; TestReduceOpHandle test_op;
......
...@@ -111,13 +111,12 @@ void ShareTensorBufferFunctor::CallOnce() { ...@@ -111,13 +111,12 @@ void ShareTensorBufferFunctor::CallOnce() {
auto *out_var = exec_scope_->FindVar(out_var_names_[i]); auto *out_var = exec_scope_->FindVar(out_var_names_[i]);
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound( in_var, platform::errors::NotFound(
"The input variable(%s)to be inplaced should not be NULL.", "The variable(%s) to be inplaced is not found in scope.",
in_var_infos_[i]->Name())); in_var_infos_[i]->Name()));
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
out_var, out_var, platform::errors::NotFound(
platform::errors::NotFound( "The variable(%s) to be inplaced is not found in scope.",
"The output variable(%s) to be inplaced should not be NULL.", out_var_names_[i]));
out_var_names_[i]));
PADDLE_ENFORCE_NE( PADDLE_ENFORCE_NE(
in_var, out_var, in_var, out_var,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
......
...@@ -12,8 +12,10 @@ ...@@ -12,8 +12,10 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h" #include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h"
#include <algorithm> #include <algorithm>
#include <utility> #include <utility>
#include "dgc/dgc.h" #include "dgc/dgc.h"
#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/reduce_and_gather.h"
...@@ -38,18 +40,23 @@ SparseAllReduceOpHandle::SparseAllReduceOpHandle( ...@@ -38,18 +40,23 @@ SparseAllReduceOpHandle::SparseAllReduceOpHandle(
is_encoded_(is_encoded), is_encoded_(is_encoded),
nranks_(nranks) { nranks_(nranks) {
// TODO(gongwb) :polish them! // TODO(gongwb) :polish them!
PADDLE_ENFORCE_EQ(is_encoded, true); PADDLE_ENFORCE_EQ(is_encoded, true, platform::errors::InvalidArgument(
"The argument is_encoded is false."));
VLOG(1) << "Use dgc allreduce mode" VLOG(1) << "Use dgc allreduce mode"
<< ", nranks:" << nranks_; << ", nranks:" << nranks_;
PADDLE_ENFORCE_GT(local_scopes_.size(), 0); PADDLE_ENFORCE_GT(local_scopes_.size(), 0,
platform::errors::PreconditionNotMet(
"The number of local scope should be > 0, but got %zu.",
local_scopes_.size()));
auto nranks_name = g_dgc_nranks; auto nranks_name = g_dgc_nranks;
for (size_t i = 0; i < local_scopes_.size(); ++i) { for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto *local_scope = local_scopes_[i]; auto *local_scope = local_scopes_[i];
auto nranks_var = local_scope->FindVar(nranks_name); auto nranks_var = local_scope->FindVar(nranks_name);
if (nranks_var == nullptr) {
PADDLE_THROW("not find nranks_var:%s", nranks_name); PADDLE_ENFORCE_NOT_NULL(
} nranks_var, platform::errors::NotFound(
"Variable %s is not found in scope.", nranks_name));
float *dgc_nranks = nranks_var->GetMutable<LoDTensor>()->data<float>(); float *dgc_nranks = nranks_var->GetMutable<LoDTensor>()->data<float>();
*dgc_nranks = nranks; *dgc_nranks = nranks;
...@@ -64,10 +71,18 @@ void SparseAllReduceOpHandle::RunImplEncoded() { ...@@ -64,10 +71,18 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs()); auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(), in_var_handles.size(), places_.size(),
"The NoDummyInputSize should be equal to the number of places."); platform::errors::PreconditionNotMet(
"The number of input variables should be equal to the number of "
"places, but got the number of input variables is %zu and the the "
"number of places is %zu.",
in_var_handles.size(), places_.size()));
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(), in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal."); platform::errors::PreconditionNotMet(
"The number of input variables should be equal to the number of "
"output variables, but got the number of input variables is %zu and "
"the the number of output variables is %zu.",
in_var_handles.size(), out_var_handles.size()));
std::vector<const LoDTensor *> ins; std::vector<const LoDTensor *> ins;
std::vector<LoDTensor *> gathers; std::vector<LoDTensor *> gathers;
...@@ -80,14 +95,17 @@ void SparseAllReduceOpHandle::RunImplEncoded() { ...@@ -80,14 +95,17 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
auto encode_var_name = original_name + g_dgc_encoded; auto encode_var_name = original_name + g_dgc_encoded;
auto *in_var = local_scope->FindVar(encode_var_name); auto *in_var = local_scope->FindVar(encode_var_name);
PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name); PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound("Variable %s is not found in scope.",
encode_var_name));
auto &in = in_var->Get<LoDTensor>(); auto &in = in_var->Get<LoDTensor>();
ins.emplace_back(&in); ins.emplace_back(&in);
auto gather_var_name = original_name + g_dgc_gather; auto gather_var_name = original_name + g_dgc_gather;
auto *gather_var = local_scope->FindVar(gather_var_name); auto *gather_var = local_scope->FindVar(gather_var_name);
PADDLE_ENFORCE_NOT_NULL(gather_var, "%s should not be null", PADDLE_ENFORCE_NOT_NULL(
gather_var_name); gather_var, platform::errors::NotFound(
"Variable %s is not found in scope.", gather_var));
auto *gather = gather_var->GetMutable<LoDTensor>(); auto *gather = gather_var->GetMutable<LoDTensor>();
gathers.emplace_back(gather); gathers.emplace_back(gather);
...@@ -100,14 +118,26 @@ void SparseAllReduceOpHandle::RunImplEncoded() { ...@@ -100,14 +118,26 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
} }
} }
PADDLE_ENFORCE(platform::is_gpu_place(ins[0]->place())); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE(platform::is_gpu_place(outs[0]->place())); platform::is_gpu_place(ins[0]->place()), true,
PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); platform::errors::InvalidArgument(
"The place of input variable should be CUDAPlace, but got %s.",
ins[0]->place()));
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(outs[0]->place()), true,
platform::errors::InvalidArgument(
"The place of input variable should be CUDAPlace, but got %s.",
outs[0]->place()));
PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, platform::errors::PreconditionNotMet(
"The nccl contexts are NULL."));
int dtype = -1; int dtype = -1;
size_t in_numel = 0; size_t in_numel = 0;
size_t out_numel = 0; size_t out_numel = 0;
PADDLE_ENFORCE(nranks_ > 1); PADDLE_ENFORCE_GT(
nranks_, 1,
platform::errors::PreconditionNotMet(
"The number of ranks should be > 1, but got %d.", nranks_));
std::vector<std::function<void()>> all_gather_calls; std::vector<std::function<void()>> all_gather_calls;
std::vector<std::function<void()>> sparse_reduce_calls; std::vector<std::function<void()>> sparse_reduce_calls;
...@@ -123,8 +153,16 @@ void SparseAllReduceOpHandle::RunImplEncoded() { ...@@ -123,8 +153,16 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
dtype = (dtype == -1) ? platform::ToNCCLDataType(in.type()) : dtype; dtype = (dtype == -1) ? platform::ToNCCLDataType(in.type()) : dtype;
in_numel = (in_numel == 0) ? static_cast<size_t>(in.numel()) : in_numel; in_numel = (in_numel == 0) ? static_cast<size_t>(in.numel()) : in_numel;
PADDLE_ENFORCE(in_numel % 2 == 0); PADDLE_ENFORCE_EQ(in_numel % 2, 0,
PADDLE_ENFORCE(in_numel / 2 == static_cast<size_t>(k)); platform::errors::InvalidArgument(
"The number of elements of input variable should be "
"even, but got %zu.",
in_numel));
PADDLE_ENFORCE_EQ(in_numel / 2, static_cast<size_t>(k),
platform::errors::InvalidArgument(
"The number of elements of input variable should be "
"even, but got %zu.",
in_numel));
out_numel = (out_numel == 0) ? static_cast<size_t>(out.numel()) : out_numel; out_numel = (out_numel == 0) ? static_cast<size_t>(out.numel()) : out_numel;
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
...@@ -154,7 +192,8 @@ void SparseAllReduceOpHandle::RunImplEncoded() { ...@@ -154,7 +192,8 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
PADDLE_ENFORCE_EQ(paddle::communication::dgc::sparseReduce( PADDLE_ENFORCE_EQ(paddle::communication::dgc::sparseReduce(
gather_buff, k, out_tensor_buf, gather_buff, k, out_tensor_buf,
static_cast<int>(out_numel), nranks_, stream), static_cast<int>(out_numel), nranks_, stream),
true); true, platform::errors::Unavailable(
"Calling sparseReduce() failed."));
}); });
} }
...@@ -187,11 +226,16 @@ void SparseAllReduceOpHandle::SparseAllReduceFunc( ...@@ -187,11 +226,16 @@ void SparseAllReduceOpHandle::SparseAllReduceFunc(
int SparseAllReduceOpHandle::GetKValue(const std::string &grad_name) { int SparseAllReduceOpHandle::GetKValue(const std::string &grad_name) {
auto original_name = paddle::framework::GradOriginalVarName(grad_name); auto original_name = paddle::framework::GradOriginalVarName(grad_name);
auto var_name = original_name + g_dgc_k; auto var_name = original_name + g_dgc_k;
PADDLE_ENFORCE(local_scopes_.size() > 0); PADDLE_ENFORCE_GT(local_scopes_.size(), 0,
platform::errors::PreconditionNotMet(
"The number of local scope should be > 0, but got %zu.",
local_scopes_.size()));
auto *scope = local_exec_scopes_[0]; auto *scope = local_exec_scopes_[0];
auto var = scope->FindVar(var_name); auto var = scope->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var); PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
var_name));
auto tensor = var->Get<LoDTensor>().data<float>(); auto tensor = var->Get<LoDTensor>().data<float>();
return *tensor; return *tensor;
} }
...@@ -202,15 +246,22 @@ bool SparseAllReduceOpHandle::IsEncoded() { ...@@ -202,15 +246,22 @@ bool SparseAllReduceOpHandle::IsEncoded() {
} }
auto counter_name = g_dgc_counter_name; auto counter_name = g_dgc_counter_name;
auto step_name = g_dgc_rampup_begin_step; auto step_name = g_dgc_rampup_begin_step;
PADDLE_ENFORCE(local_scopes_.size() > 0);
PADDLE_ENFORCE_GT(local_scopes_.size(), 0,
platform::errors::PreconditionNotMet(
"The number of local scope should be > 0, but got %zu.",
local_scopes_.size()));
auto *local_scope = local_exec_scopes_[0]; auto *local_scope = local_exec_scopes_[0];
auto count_var = local_scope->FindVar(counter_name); auto count_var = local_scope->FindVar(counter_name);
auto step_var = local_scope->FindVar(step_name); auto step_var = local_scope->FindVar(step_name);
if (count_var == nullptr || step_var == nullptr) {
PADDLE_THROW("not find count_var:%s or step_var:%s", counter_name, PADDLE_ENFORCE_NOT_NULL(
step_var); count_var, platform::errors::NotFound(
} "Variable %s is not found in scope.", counter_name));
PADDLE_ENFORCE_NOT_NULL(
step_var, platform::errors::NotFound("Variable %s is not found in scope.",
step_name));
float count = *count_var->Get<LoDTensor>().data<float>(); float count = *count_var->Get<LoDTensor>().data<float>();
float step = *step_var->Get<LoDTensor>().data<float>(); float step = *step_var->Get<LoDTensor>().data<float>();
......
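Editorial note: the two size checks added above rest on the assumption that a DGC-encoded gradient stores its k selected entries as index/value pairs, so its element count must be exactly 2 * k. A sketch of the invariant only; the exact in-buffer ordering is not relied on by the checks:

// encoded gradient with k selected values (assumed layout)
//   in_numel == 2 * k    one index plus one value per selected element
//   in_numel % 2 == 0    therefore always even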
...@@ -97,6 +97,7 @@ message AsyncConfig { ...@@ -97,6 +97,7 @@ message AsyncConfig {
optional int32 thread_pool_size = 6 [ default = 1 ]; optional int32 thread_pool_size = 6 [ default = 1 ];
optional int32 send_wait_times = 7 [ default = 1 ]; optional int32 send_wait_times = 7 [ default = 1 ];
optional bool runtime_split_send_recv = 8 [ default = false ]; optional bool runtime_split_send_recv = 8 [ default = false ];
optional bool launch_barrier = 9 [ default = true ];
} }
message PipelineConfig { optional int32 micro_batch = 1 [ default = 1 ]; } message PipelineConfig { optional int32 micro_batch = 1 [ default = 1 ]; }
......
...@@ -127,11 +127,10 @@ void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) { ...@@ -127,11 +127,10 @@ void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
"Baidu Kunlun Card is properly installed.", "Baidu Kunlun Card is properly installed.",
ret)); ret));
ret = xpu_malloc(reinterpret_cast<void **>(&p), size); ret = xpu_malloc(reinterpret_cast<void **>(&p), size);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, PADDLE_ENFORCE_EQ(
platform::errors::External( ret, XPU_SUCCESS,
"XPU API return wrong value[%d], please check whether " platform::errors::External(
"Baidu Kunlun Card is properly installed.", "XPU API return wrong value[%d], no enough memory", ret));
ret));
if (FLAGS_init_allocated_mem) { if (FLAGS_init_allocated_mem) {
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
"xpu memory FLAGS_init_allocated_mem is not implemented.")); "xpu memory FLAGS_init_allocated_mem is not implemented."));
......
...@@ -891,6 +891,28 @@ class SquareDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> { ...@@ -891,6 +891,28 @@ class SquareDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
} }
}; };
// log Grad: dx = dout / x
// log Grad Grad: ddout = ddx / x; dx = -(dout / x) * (ddx / x)
template <typename T>
class LogDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
public:
using ::paddle::framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("log_grad_grad");
op->SetInput("X", this->Input("X"));
// X@GRAD@GRAD: ddx
op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
op->SetInput("DOut", this->Input(framework::GradVarName("Out")));
op->SetAttrMap(this->Attrs());
// X@GRAD: dx
op->SetOutput("DX", this->InputGrad("X"));
// Out@GRAD@GRAD: ddy
op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out")));
}
};
DECLARE_INPLACE_OP_INFERER(ActivationGradOpInplaceInferer, DECLARE_INPLACE_OP_INFERER(ActivationGradOpInplaceInferer,
{framework::GradVarName("Out"), {framework::GradVarName("Out"),
framework::GradVarName("X")}); framework::GradVarName("X")});
...@@ -1272,6 +1294,35 @@ REGISTER_OP_CPU_KERNEL( ...@@ -1272,6 +1294,35 @@ REGISTER_OP_CPU_KERNEL(
ops::AbsGradGradFunctor<int64_t>>); ops::AbsGradGradFunctor<int64_t>>);
/* ========================================================================== */ /* ========================================================================== */
/* ========================== Log register ==================================*/
REGISTER_OPERATOR(
log, ops::ActivationOp, ops::LogOpMaker, ops::ActivationOpInferVarType,
ops::ActivationGradOpMaker<ops::LogGradFunctor<float>::FwdDeps(),
paddle::framework::OpDesc>,
ops::ActivationGradOpMaker<ops::LogGradFunctor<float>::FwdDeps(),
paddle::imperative::OpBase>,
ops::ActFwdInplaceInferer);
REGISTER_OPERATOR(log_grad, ops::ActivationOpGrad,
ops::ActivationGradOpInplaceInferer,
ops::LogDoubleGradMaker<paddle::framework::OpDesc>,
ops::LogDoubleGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(
log_grad_grad,
ops::ActivationOpDoubleGrad<ops::LogGradGradFunctor<float>::FwdDeps()>,
ops::ActivationDoubleGradOpInplaceInferer);
REGISTER_ACTIVATION_CPU_KERNEL(log, Log, LogFunctor, LogGradFunctor);
REGISTER_OP_CPU_KERNEL(
log_grad_grad, ops::LogDoubleGradKernel<plat::CPUDeviceContext,
ops::LogGradGradFunctor<float>>,
ops::LogDoubleGradKernel<plat::CPUDeviceContext,
ops::LogGradGradFunctor<double>>,
ops::LogDoubleGradKernel<plat::CPUDeviceContext,
ops::LogGradGradFunctor<plat::float16>>);
/* ========================================================================== */
/* ========================== register checkpoint ===========================*/ /* ========================== register checkpoint ===========================*/
REGISTER_OP_VERSION(leaky_relu) REGISTER_OP_VERSION(leaky_relu)
.AddCheckpoint( .AddCheckpoint(
......
...@@ -193,3 +193,15 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -193,3 +193,15 @@ REGISTER_OP_CUDA_KERNEL(
ops::ActivationDoubleGradKernel<paddle::platform::CUDADeviceContext, ops::ActivationDoubleGradKernel<paddle::platform::CUDADeviceContext,
ops::AbsGradGradFunctor<int64_t>>); ops::AbsGradGradFunctor<int64_t>>);
/* ========================================================================== */ /* ========================================================================== */
/* ========================== Log register ==================================*/
REGISTER_ACTIVATION_CUDA_KERNEL(log, Log, LogFunctor, LogGradFunctor);
REGISTER_OP_CUDA_KERNEL(
log_grad_grad, ops::LogDoubleGradKernel<plat::CUDADeviceContext,
ops::LogGradGradFunctor<float>>,
ops::LogDoubleGradKernel<plat::CUDADeviceContext,
ops::LogGradGradFunctor<double>>,
ops::LogDoubleGradKernel<plat::CUDADeviceContext,
ops::LogGradGradFunctor<plat::float16>>);
/* ========================================================================== */
...@@ -1663,6 +1663,10 @@ class SquareDoubleGradKernel ...@@ -1663,6 +1663,10 @@ class SquareDoubleGradKernel
} }
}; };
template <typename DeviceContext, typename Functor>
class LogDoubleGradKernel
: public SquareDoubleGradKernel<DeviceContext, Functor> {};
template <typename DeviceContext, typename Functor> template <typename DeviceContext, typename Functor>
class ELUDoubleGradKernel class ELUDoubleGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> { : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
...@@ -1852,6 +1856,37 @@ class PowGradKernel ...@@ -1852,6 +1856,37 @@ class PowGradKernel
functor(*place, x, out, dout, dx); functor(*place, x, out, dout, dx);
} }
}; };
template <typename T>
struct LogGradGradFunctor : public BaseActivationFunctor<T> {
template <typename Device>
void operator()(const Device& dev, const framework::Tensor* X,
const framework::Tensor* ddX, framework::Tensor* ddOut,
const framework::Tensor* dOut, framework::Tensor* dX) const {
auto* d = dev.eigen_device();
auto ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "LogGradGrad"));
auto x = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(X, "Input", "X", "LogGradGrad"));
// ddout = ddx / x; dx = -(dout / x) * (ddx / x)
// calculate dx first, so ddout can inplace ddx
if (dX) {
auto dout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOut, "Output", "DOut", "LogGradGrad"));
auto dx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dX, "Output", "DX", "LogGradGrad"));
dx.device(*d) = dout * static_cast<T>(-1) * ddx / (x * x);
}
if (ddOut) {
auto ddout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DDOut", "LogGradGrad"));
ddout.device(*d) = ddx * static_cast<T>(1) / x;
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -1872,7 +1907,6 @@ class PowGradKernel ...@@ -1872,7 +1907,6 @@ class PowGradKernel
__macro(cosh, Cosh, CoshFunctor, CoshGradFunctor); \ __macro(cosh, Cosh, CoshFunctor, CoshGradFunctor); \
__macro(round, Round, RoundFunctor, ZeroGradFunctor); \ __macro(round, Round, RoundFunctor, ZeroGradFunctor); \
__macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \
__macro(log, Log, LogFunctor, LogGradFunctor); \
__macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \ __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \
__macro(brelu, BRelu, BReluFunctor, BReluGradFunctor); \ __macro(brelu, BRelu, BReluFunctor, BReluGradFunctor); \
__macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \ __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \
......
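Editorial note: the dx and ddout formulas hard-coded in LogDoubleGradMaker and LogGradGradFunctor follow from differentiating y = log(x) twice; a short derivation of the comment above (not part of the patch):

\[ y = \log(x), \qquad \frac{\partial y}{\partial x} = \frac{1}{x} \;\Rightarrow\; dx = \frac{dout}{x} \]
\[ ddout = ddx \cdot \frac{1}{x}, \qquad dx = dout \cdot ddx \cdot \frac{\partial^2 y}{\partial x^2} = -\frac{dout}{x} \cdot \frac{ddx}{x} \]

which matches ddout = ddx / x and dx = -(dout / x) * (ddx / x) in the functor; dx is computed first so that ddout may reuse the ddx buffer in place.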
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/activation_op.h"
#include <string>
#include "paddle/fluid/platform/xpu_header.h"
namespace paddle {
namespace operators {
using paddle::framework::Tensor;
template <typename Functor>
class XPUActivationKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext &context) const override {
Functor functor;
auto attrs = functor.GetAttrs();
for (auto &attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
functor(context);
}
};
template <typename Functor>
class XPUActivationGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext &context) const override {
Functor functor;
auto attrs = functor.GetAttrs();
for (auto &attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
functor(context);
}
};
template <typename DeviceContext, typename T>
void xpu_activation_forward(const framework::ExecutionContext &ctx,
xpu::Activation_t type) {
const auto *x = ctx.Input<Tensor>("X");
auto *y = ctx.Output<Tensor>("Out");
const T *x_data = x->data<T>();
T *y_data = y->mutable_data<T>(ctx.GetPlace());
int r = 0;
if (xpu::Activation_t::ACT_POW == type.type) {
type.pow_factor = ctx.Attr<float>("factor");
}
auto xpu_context = ctx.device_context<DeviceContext>().x_context();
r = xpu::activation_forward(xpu_context, type, x->numel(),
reinterpret_cast<const float *>(x_data),
reinterpret_cast<float *>(y_data));
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
}
template <typename DeviceContext, typename T>
void xpu_activation_backward(const framework::ExecutionContext &ctx,
xpu::Activation_t type) {
/* TODO: relu tanh sigmoid are inplace */
const auto *x = ctx.Input<Tensor>("X");
auto *y = ctx.Input<Tensor>("Out");
auto *dOut = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto *dX = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
const T *x_data = nullptr;
const T *y_data = nullptr;
const T *y_grad = nullptr;
if (x != nullptr) x_data = x->data<T>();
if (y != nullptr) y_data = y->data<T>();
if (dOut != nullptr) y_grad = dOut->data<T>();
T *x_grad = dX->mutable_data<T>(ctx.GetPlace());
auto xpu_context = ctx.device_context<DeviceContext>().x_context();
int r = xpu::activation_backward(xpu_context, type, dX->numel(),
reinterpret_cast<const float *>(x_data),
reinterpret_cast<const float *>(y_data),
reinterpret_cast<const float *>(y_grad),
reinterpret_cast<float *>(x_grad));
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
}
template <typename T, xpu::Activation_t::act_enum algorithm>
struct XPUActivationFunc : public BaseActivationFunctor<T> {
void operator()(const framework::ExecutionContext &ctx) const {
xpu_activation_forward<paddle::platform::XPUDeviceContext, T>(ctx,
algorithm);
}
};
template <typename T, xpu::Activation_t::act_enum algorithm>
struct XPUActivationGradFunc : public BaseActivationFunctor<T> {
void operator()(const framework::ExecutionContext &ctx) const {
xpu_activation_backward<paddle::platform::XPUDeviceContext, T>(ctx,
algorithm);
}
};
template <typename T>
using XPUReluFunctor = XPUActivationFunc<T, xpu::Activation_t::RELU>;
template <typename T>
using XPUSigmoidFunctor = XPUActivationFunc<T, xpu::Activation_t::SIGMOID>;
template <typename T>
using XPUTanhFunctor = XPUActivationFunc<T, xpu::Activation_t::TANH>;
template <typename T>
using XPUGeluFunctor = XPUActivationFunc<T, xpu::Activation_t::GELU>;
template <typename T>
using XPULogFunctor = XPUActivationFunc<T, xpu::Activation_t::LOG>;
template <typename T>
using XPUSquareFunctor = XPUActivationFunc<T, xpu::Activation_t::SQUARE>;
template <typename T>
using XPUSquareGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::SQUARE>;
template <typename T>
using XPUReluGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::RELU>;
template <typename T>
using XPUSigmoidGradFunctor =
XPUActivationGradFunc<T, xpu::Activation_t::SIGMOID>;
template <typename T>
using XPUTanhGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::TANH>;
template <typename T>
using XPUGeluGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::GELU>;
template <typename T>
using XPUSqrtFunctor = XPUActivationFunc<T, xpu::Activation_t::SQRT>;
template <typename T>
using XPUSqrtGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::SQRT>;
template <typename T>
using XPUACTPowFunctor = XPUActivationFunc<T, xpu::Activation_t::ACT_POW>;
template <typename T>
using XPUABSFunctor = XPUActivationFunc<T, xpu::Activation_t::ABS>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
#define REGISTER_ACTIVATION_XPU_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_XPU_KERNEL(act_type, \
ops::XPUActivationKernel<ops::functor<float>>); \
REGISTER_OP_XPU_KERNEL( \
act_type##_grad, \
ops::XPUActivationGradKernel<ops::grad_functor<float>>);
REGISTER_ACTIVATION_XPU_KERNEL(relu, XPUReluFunctor, XPUReluGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(tanh, XPUTanhFunctor, XPUTanhGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, XPUSigmoidFunctor,
XPUSigmoidGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(gelu, XPUGeluFunctor, XPUGeluGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(sqrt, XPUSqrtFunctor, XPUSqrtGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(square, XPUSquareFunctor, XPUSquareGradFunctor)
REGISTER_OP_XPU_KERNEL(log,
ops::XPUActivationKernel<ops::XPULogFunctor<float>>);
REGISTER_OP_XPU_KERNEL(pow,
ops::XPUActivationKernel<ops::XPUACTPowFunctor<float>>);
REGISTER_OP_XPU_KERNEL(abs,
ops::XPUActivationKernel<ops::XPUABSFunctor<float>>);
#endif // PADDLE_WITH_XPU
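Editorial note: REGISTER_ACTIVATION_XPU_KERNEL emits one forward and one backward kernel registration per activation; log, pow and abs get forward-only registrations, which is why the macro is not used for them. A rough sketch of the expansion for relu (illustrative only, derived from the macro definition above):

// REGISTER_ACTIVATION_XPU_KERNEL(relu, XPUReluFunctor, XPUReluGradFunctor) expands roughly to:
REGISTER_OP_XPU_KERNEL(relu, ops::XPUActivationKernel<ops::XPUReluFunctor<float>>);
REGISTER_OP_XPU_KERNEL(relu_grad,
                       ops::XPUActivationGradKernel<ops::XPUReluGradFunctor<float>>);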
...@@ -30,8 +30,10 @@ __global__ void ComputeDifferent(T *centers_diff, const T *X, const T *centers, ...@@ -30,8 +30,10 @@ __global__ void ComputeDifferent(T *centers_diff, const T *X, const T *centers,
while (idy < K) { while (idy < K) {
int64_t id = ids[idy]; int64_t id = ids[idy];
PADDLE_ENFORCE(id >= 0, "received id:", id); PADDLE_ENFORCE(id >= 0, "Id should larger than 0 but received id: %d.", id);
PADDLE_ENFORCE(id < N, "received id:", id); PADDLE_ENFORCE(id < N, "Id should smaller than %d but received id: %d.", N,
id);
T *out = centers_diff + idy * D; T *out = centers_diff + idy * D;
const T *x = X + idy * D; const T *x = X + idy * D;
const T *cent = centers + id * D; const T *cent = centers + id * D;
...@@ -52,8 +54,9 @@ __global__ void UpdateCenters(T *centers, T *centers_diff, const int64_t *ids, ...@@ -52,8 +54,9 @@ __global__ void UpdateCenters(T *centers, T *centers_diff, const int64_t *ids,
while (idy < K) { while (idy < K) {
int count = 1; int count = 1;
int64_t id = ids[idy]; int64_t id = ids[idy];
PADDLE_ENFORCE(id >= 0, "received id:", id); PADDLE_ENFORCE(id >= 0, "Id should larger than 0 but received id: %d.", id);
PADDLE_ENFORCE(id < N, "received id:", id); PADDLE_ENFORCE(id < N, "Id should smaller than %d but received id: %d.", N,
id);
for (int i = 0; i < K; i++) { for (int i = 0; i < K; i++) {
if (ids[i] == id) { if (ids[i] == id) {
......
...@@ -69,8 +69,10 @@ template <typename T> ...@@ -69,8 +69,10 @@ template <typename T>
class CTCAlignOpCUDAKernel : public framework::OpKernel<T> { class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
"It must use CUDAPlace."); platform::errors::InvalidArgument(
"CTCAlign operator CUDA kernel must use CUDAPlace "
"rather than CPUPlace."));
auto* input = ctx.Input<LoDTensor>("Input"); auto* input = ctx.Input<LoDTensor>("Input");
auto* output = ctx.Output<LoDTensor>("Output"); auto* output = ctx.Output<LoDTensor>("Output");
const int blank = ctx.Attr<int>("blank"); const int blank = ctx.Attr<int>("blank");
......
...@@ -72,8 +72,11 @@ class CTCAlignKernel : public framework::OpKernel<T> { ...@@ -72,8 +72,11 @@ class CTCAlignKernel : public framework::OpKernel<T> {
// check input dims and lod // check input dims and lod
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
input_dims[0], static_cast<int64_t>(input_lod[level].back()), input_dims[0], static_cast<int64_t>(input_lod[level].back()),
"The first dimension of Input(Input) should be equal to " platform::errors::InvalidArgument(
"the sum of all sequences' lengths."); "The first dimension %d of CTCAlign operator Input(Input) should "
"be equal to "
"the sum of all sequences' lengths %d.",
input_dims[0], static_cast<int64_t>(input_lod[level].back())));
const size_t num_sequences = input_lod[level].size() - 1; const size_t num_sequences = input_lod[level].size() - 1;
......
...@@ -175,10 +175,6 @@ void RecvGeoSparseRecords(const CommContext &rpc_ctx, ...@@ -175,10 +175,6 @@ void RecvGeoSparseRecords(const CommContext &rpc_ctx,
template <typename T> template <typename T>
void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) { void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto cpu_place = platform::CPUPlace();
auto &cpu_ctx = *pool.Get(cpu_place);
distributed::RPCClient *rpc_client = distributed::RPCClient *rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>(rpc_ctx.trainer_id); distributed::RPCClient::GetInstance<RPCCLIENT_T>(rpc_ctx.trainer_id);
...@@ -188,8 +184,13 @@ void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) { ...@@ -188,8 +184,13 @@ void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) {
if (rpc_ctx.origin_varnames.size() == 1 && if (rpc_ctx.origin_varnames.size() == 1 &&
rpc_ctx.splited_varnames.size() == 1) { rpc_ctx.splited_varnames.size() == 1) {
auto varname = rpc_ctx.origin_varnames[0]; auto varname = rpc_ctx.origin_varnames[0];
VLOG(4) << "recv " << varname << " from " << rpc_ctx.epmap[0]; const auto place =
rets.push_back(rpc_client->AsyncGetVarNoBarrier(rpc_ctx.epmap[0], cpu_ctx, scope.FindVar(varname)->Get<framework::LoDTensor>().place();
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &ctx = *pool.Get(place);
VLOG(4) << "recv " << varname << " from " << rpc_ctx.epmap[0] << " in gpu? "
<< platform::is_gpu_place(place);
rets.push_back(rpc_client->AsyncGetVarNoBarrier(rpc_ctx.epmap[0], ctx,
scope, varname, varname)); scope, varname, varname));
for (size_t i = 0; i < rets.size(); i++) { for (size_t i = 0; i < rets.size(); i++) {
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_xpu.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ElementwiseAddXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
XPUElementwise<T, XPUAddFunctor<T>>(ctx);
}
};
template <typename DeviceContext, typename T>
class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
ElemwiseGradKernel<T>::Compute(ctx);
using Tensor = framework::Tensor;
auto *dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto *dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
auto dx_dims = dout->dims();
auto dy_dims_untrimed = dout->dims();
T *dx_data = NULL;
T *dy_data = NULL;
int axis = ctx.Attr<int>("axis");
PADDLE_ENFORCE_GE(dx_dims.size(), dy_dims_untrimed.size(),
"Rank of first input must >= rank of second input.");
if (dx != nullptr) {
dx->mutable_data<T>(ctx.GetPlace());
dx_dims = dx->dims();
dx_data = dx->data<T>();
}
if (dy != nullptr) {
dy->mutable_data<T>(ctx.GetPlace());
dy_dims_untrimed = dy->dims();
dy_data = dy->data<T>();
}
int pre, n, post, is_common_broadcast;
if (dx_dims == dy_dims_untrimed) {
pre = post = 1;
n = dout->numel();
} else {
axis = (axis == -1 ? dx_dims.size() - dy_dims_untrimed.size() : axis);
PADDLE_ENFORCE(axis >= 0 && axis < dx_dims.size(),
"Axis should be in range [0, dx_dims)");
auto dy_dims = trim_trailing_singular_dims(dy_dims_untrimed);
axis = (dy_dims.size() == 0) ? dx_dims.size() : axis;
get_mid_dims(dx_dims, dy_dims, axis, &pre, &n, &post,
&is_common_broadcast);
}
int len = pre * n * post;
auto &dev_ctx =
ctx.template device_context<paddle::platform::XPUDeviceContext>();
if (post == 1) {
int r = xpu::matrix_vector_add_grad(
dev_ctx.x_context(), dout->data<T>(), dout->data<T>(),
dout->data<T>(), dout->data<T>(), dx_data, dy_data, pre, n);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
return;
}
if (dx == nullptr) {
PADDLE_ENFORCE_EQ(
xpu_malloc(reinterpret_cast<void **>(&dx_data), len * sizeof(float)),
XPU_SUCCESS, platform::errors::External("XPU has no enough memory"));
}
if (dy == nullptr) {
PADDLE_ENFORCE_EQ(
xpu_malloc(reinterpret_cast<void **>(&dy_data), len * sizeof(float)),
XPU_SUCCESS, platform::errors::External("XPU has no enough memory"));
} else {
if (len != n) {
PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast<void **>(&dy_data),
len * sizeof(float)),
XPU_SUCCESS, platform::errors::External(
"XPU has no enough memory"));
}
}
int r = xpu::elementwise_add_grad(
dev_ctx.x_context(), dout->data<T>() /*x*/, dout->data<T>() /*y*/,
dout->data<T>() /*out*/, dout->data<T>(), dx_data, dy_data, len);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
if ((dy != nullptr) && (len != n)) {
r = xpu::reduce_ew(dev_ctx.x_context(), dy_data, dy->data<T>(), pre, n,
post, xpu::ElementwiseOp::ASSIGN);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
dev_ctx.Wait();
xpu_free(dy_data);
}
if ((dx == nullptr || dy == nullptr) && !(dy != nullptr && len != n)) {
dev_ctx.Wait();
}
if (dx == nullptr) {
xpu_free(dx_data);
}
if (dy == nullptr) {
xpu_free(dy_data);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
elementwise_add,
ops::ElementwiseAddXPUKernel<paddle::platform::XPUDeviceContext, float>);
REGISTER_OP_XPU_KERNEL(elementwise_add_grad,
ops::ElementwiseAddGradXPUKernel<
paddle::platform::XPUDeviceContext, float>);
#endif
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace operators {
template <typename T>
struct XPUAddFunctor {
int operator()(xpu::Context* ctx, const T* x, const T* y, T* z, int len) {
return xpu::elementwise_add(ctx, x, y, z, len);
}
};
template <typename T>
struct XPUMulFunctor {
int operator()(xpu::Context* ctx, const T* x, const T* y, T* z, int len) {
return xpu::elementwise_mul(ctx, x, y, z, len);
}
};
template <typename T, typename Functor>
void XPUElementwise(const framework::ExecutionContext& ctx) {
PADDLE_ENFORCE(platform::is_xpu_place(ctx.GetPlace()),
"This kernel only runs on XPU device.");
auto x_var = ctx.InputVar("X");
PADDLE_ENFORCE_NE(x_var, nullptr,
platform::errors::Fatal("Cannot get input Variable X"));
PADDLE_ENFORCE(x_var->IsType<framework::LoDTensor>(),
"XPU only support LoDTensor");
auto x = x_var->Get<framework::LoDTensor>();
auto* y = ctx.Input<framework::LoDTensor>("Y");
auto* z = ctx.Output<framework::LoDTensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
auto x_dims = x.dims();
auto y_dims_untrimed = y->dims();
PADDLE_ENFORCE_GE(x_dims.size(), y_dims_untrimed.size(),
"Rank of first input must >= rank of second input.");
axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
"Axis should be in range [0, x_dims)");
auto y_dims = trim_trailing_singular_dims(y_dims_untrimed);
axis = (y_dims.size() == 0) ? x_dims.size() : axis;
int pre, n, post, is_common_broadcast;
get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post, &is_common_broadcast);
int len = pre * n * post;
const T* x_data = x.data<T>();
const T* y_data = y->data<T>();
T* z_data = z->data<T>();
T* y_broadcast = nullptr;
auto& dev_ctx =
ctx.template device_context<paddle::platform::XPUDeviceContext>();
if (post == 1) {
if (std::is_same<Functor, XPUAddFunctor<T>>::value) {
int res = xpu::matrix_vector_add(dev_ctx.x_context(), x_data, y_data,
z_data, pre, n);
PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d",
res);
return;
}
if (std::is_same<Functor, XPUMulFunctor<T>>::value) {
int res = xpu::matrix_vector_mul(dev_ctx.x_context(), x_data, y_data,
z_data, pre, n);
PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d",
res);
return;
}
}
if (pre != 1 || post != 1) {
PADDLE_ENFORCE(xpu_malloc(reinterpret_cast<void**>(&y_broadcast),
len * sizeof(T)) == XPU_SUCCESS);
int res = xpu::broadcast_ew(dev_ctx.x_context(), y_data, y_broadcast, pre,
n, post, xpu::ElementwiseOp::ASSIGN);
PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d",
res);
y_data = y_broadcast;
}
Functor functor;
int res = functor(dev_ctx.x_context(), x_data, y_data, z_data, len);
PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d",
res);
if (pre != 1 || post != 1) {
dev_ctx.Wait();
xpu_free(y_broadcast);
}
}
} // namespace operators
} // namespace paddle
#endif
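Editorial note: a worked example of the pre/n/post decomposition that the broadcast path above relies on (a sketch assuming the usual semantics of get_mid_dims; the concrete shapes are hypothetical):

// x_dims = [2, 3, 4, 5], y_dims = [3, 4], axis = 1
// pre  = 2            product of x dims before the aligned block
// n    = 3 * 4 = 12   product of y dims (the aligned block)
// post = 5            product of x dims after the aligned block
// len  = pre * n * post = 120
// Since post != 1, broadcast_ew first tiles the 12-element y into a len-sized
// buffer and the element-wise functor then runs over len elements; when
// post == 1 the matrix_vector_* fast path is taken for add and mul instead.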
...@@ -46,6 +46,7 @@ class ScaleLoDTensorFunctor<platform::CPUDeviceContext, T> { ...@@ -46,6 +46,7 @@ class ScaleLoDTensorFunctor<platform::CPUDeviceContext, T> {
}; };
template class ScaleLoDTensorFunctor<platform::CPUDeviceContext, float>; template class ScaleLoDTensorFunctor<platform::CPUDeviceContext, float>;
template class ScaleLoDTensorFunctor<platform::CPUDeviceContext, double>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
...@@ -52,6 +52,7 @@ class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> { ...@@ -52,6 +52,7 @@ class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
}; };
template class ScaleLoDTensorFunctor<platform::CUDADeviceContext, float>; template class ScaleLoDTensorFunctor<platform::CUDADeviceContext, float>;
template class ScaleLoDTensorFunctor<platform::CUDADeviceContext, double>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include <algorithm>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
namespace operators {
static framework::DDim RowMatrixFromVector(const framework::DDim &x_dim) {
if (x_dim.size() > 1) {
return x_dim;
}
return framework::make_ddim({1, x_dim[0]});
}
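// Fold the leading two dimensions of a rank-3 tensor into one, i.e.
// [B, M, N] -> [B * M, N]; tensors of any other rank are returned as-is.
// Only the view is resized, the underlying data is shared with the input.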
static framework::Tensor FoldInitDims(const framework::Tensor &input) {
auto output = input;
auto in_dims = input.dims();
if (in_dims.size() == 3) {
output.Resize({in_dims[0] * in_dims[1], in_dims[2]});
}
return output;
}
/**
 * Get column matrix shape from a vector shape. If the rank of y_dim > 1, the
* original y_dim is returned.
*/
static framework::DDim ColumnMatrixFromVector(const framework::DDim &y_dim) {
if (y_dim.size() > 1) {
return y_dim;
}
return framework::make_ddim({y_dim[0], 1});
}
static void ReshapeTensorIntoMatrixSequence(
framework::Tensor *x, const math::MatDescriptor &descriptor) {
int64_t h, w;
h = descriptor.height_;
w = descriptor.width_;
if (descriptor.trans_) {
std::swap(w, h);
}
if (descriptor.batch_size_) {
x->Resize({descriptor.batch_size_, h, w});
} else {
x->Resize({h, w});
}
}
/**
 * Reshape the x, y and out tensors to 3-D or 2-D tensors by matrix descriptor
 * Out = matmul(x, y)
 *
 * This method first computes the X and Y matrix sequences, and then computes
 * the out shape.
 *
 * Assume X = [BatchSize, H1, W1], Y = [BatchSize, H2, W2]
 * The out = [BatchSize, H1, W2]
 *
 * If there is no batch size in `X` and `Y`, the out will be [H1, W2].
 * If either `X` or `Y` has a batch size BatchSize, the out will carry that
 * BatchSize.
*/
static void ReshapeXYOutIntoMatrixSequence(framework::Tensor *x,
framework::Tensor *y,
framework::Tensor *out, bool trans_x,
bool trans_y) {
auto x_dim = RowMatrixFromVector(x->dims());
auto y_dim = ColumnMatrixFromVector(y->dims());
auto mat_dim_x = math::CreateMatrixDescriptor(x_dim, 0, trans_x);
auto mat_dim_y = math::CreateMatrixDescriptor(y_dim, 0, trans_y);
if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) {
out->Resize({mat_dim_x.height_, mat_dim_y.width_});
} else {
out->Resize({std::max(mat_dim_x.batch_size_, mat_dim_y.batch_size_),
mat_dim_x.height_, mat_dim_y.width_});
}
ReshapeTensorIntoMatrixSequence(x, mat_dim_x);
ReshapeTensorIntoMatrixSequence(y, mat_dim_y);
}
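// Illustrative trace of the helpers above: with x = [10, 2, 3] and
// y = [10, 3, 4] (both transpose flags false) the descriptors report a batch
// size of 10, so out is resized to [10, 2, 4]; with plain matrices
// x = [2, 3] and y = [3, 4] both batch sizes are 0 and out becomes [2, 4].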
template <typename DeviceContext, typename T>
class MatMulXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *x = context.Input<framework::Tensor>("X");
auto *y = context.Input<framework::Tensor>("Y");
auto *out = context.Output<framework::Tensor>("Out");
out->mutable_data<T>(context.GetPlace());
auto mat_dim_a = math::CreateMatrixDescriptor(
RowMatrixFromVector(x->dims()), 0, context.Attr<bool>("transpose_X"));
auto mat_dim_b =
math::CreateMatrixDescriptor(ColumnMatrixFromVector(y->dims()), 0,
context.Attr<bool>("transpose_Y"));
PADDLE_ENFORCE_EQ(
mat_dim_a.width_, mat_dim_b.height_,
platform::errors::InvalidArgument("Shape mistake in matmul_op"));
PADDLE_ENFORCE_EQ(
mat_dim_a.batch_size_, mat_dim_b.batch_size_,
platform::errors::InvalidArgument("Shape mistake in matmul_op"));
T alpha = static_cast<T>(context.Attr<float>("alpha"));
auto &dev_ctx = context.template device_context<DeviceContext>();
float *data_c = out->data<T>();
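    // A single (or absent) batch is issued as one fc_int16 call; otherwise
    // the batched GEMM path below handles all batches in one call.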
if (mat_dim_a.batch_size_ == 0 || mat_dim_a.batch_size_ == 1) {
int r =
xpu::fc_int16(dev_ctx.x_context(), mat_dim_a.trans_, mat_dim_b.trans_,
mat_dim_a.height_, mat_dim_b.width_, mat_dim_a.width_,
alpha, x->data<T>(), y->data<T>(), 0.0f, data_c);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
} else {
// batch matmul
int r = xpu::batched_gemm_int16(dev_ctx.x_context(), mat_dim_a.trans_,
mat_dim_b.trans_, mat_dim_a.batch_size_,
mat_dim_a.height_, mat_dim_b.width_,
mat_dim_a.width_, alpha, x->data<T>(),
y->data<T>(), data_c, nullptr, nullptr);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
}
}
};
// Reshape a rank-3 tensor from P x M x N to M x (P * N).
// (Warning: This requires transposing data and writes into new memory.)
// Identity op if the tensor is not of rank 3.
template <typename DeviceContext, typename T>
static framework::Tensor XPUFoldHeadAndLastDims(
const DeviceContext &context, const framework::Tensor &input) {
auto in_dims = input.dims();
if (in_dims.size() != 3) {
return input;
}
framework::Tensor output;
output.Resize({in_dims[1], in_dims[0], in_dims[2]});
output.mutable_data<T>(context.GetPlace());
std::vector<int> in_shape_host = {static_cast<int>(in_dims[0]),
static_cast<int>(in_dims[1]),
static_cast<int>(in_dims[2])};
std::vector<int> axis_host = {1, 0, 2};
int r = xpu::transpose(context.x_context(), input.data<T>(), output.data<T>(),
in_shape_host.data(), axis_host.data(), /*ndims=*/3);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
output.Resize({in_dims[1], in_dims[0] * in_dims[2]});
return output;
}
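// Illustrative trace: a [2, 3, 4] input is transposed on the device with
// axes {1, 0, 2} into a [3, 2, 4] buffer and then viewed as [3, 8], i.e.
// P x M x N -> M x (P * N) as described in the comment above.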
// Using dimensional constraints on matrix multiplication, it is
// straight-forward to check the following table for when X and Y
// are both matrices.
//
// transpose_X | False | True | False | True
// transpose_Y | False | False | True | True
// -----------+----------+----------+----------+-----------
// dX = | dOut Y^T | Y dOut^T | dOut Y | Y^T dOut^T
// dY = | X^T dOut | X dOut | dOut^T X | dOut^T X^T
//
// When X is a vector of size K, we treat it instead as a matrix of shape
// (1, K). Similarly, when Y is a vector of size K, we treat it instead as
// a matrix of shape (K, 1).
//
// When X and Y are both 3-dimensional tensors, the first dimension (the
// batch dimension) can be ignored and the exact same formulas apply as for
// two matrices.
//
// Finally, when, e.g., X is a 3-dimensional tensor but Y is a matrix, we end
// up with formulas like
//
// dY_{ij} = \sum_{p, m} X_{pmi} dOut_{pmj}
//
// To handle this sort of scenario, we reshape X : P x M x K, dOut: P x M x N
// to X: (P * M) x K, dOut: (P * M) x N.
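// As a concrete reading of the table: with transpose_X = transpose_Y = false,
// dX = dOut * Y^T and dY = X^T * dOut, which is exactly what the final
// else-branch of Compute() below passes to CalcInputGrad.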
template <typename DeviceContext, typename T>
class MatMulGradXPUKernel : public framework::OpKernel<T> {
public:
void MatMul(const framework::ExecutionContext &context,
const framework::Tensor &a, bool trans_a,
const framework::Tensor &b, bool trans_b,
framework::Tensor *out) const {
out->mutable_data<T>(context.GetPlace());
auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a);
auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b);
PADDLE_ENFORCE_EQ(
mat_dim_a.width_, mat_dim_b.height_,
platform::errors::InvalidArgument("Shape mistake in matmul_grad_op"));
PADDLE_ENFORCE_EQ(
mat_dim_a.batch_size_, mat_dim_b.batch_size_,
platform::errors::InvalidArgument("Shape mistake in matmul_grad_op"));
T alpha = static_cast<T>(context.Attr<float>("alpha"));
auto &dev_ctx = context.template device_context<DeviceContext>();
float *data_c = out->data<T>();
if (mat_dim_a.batch_size_ == 0 || mat_dim_a.batch_size_ == 1) {
int r =
xpu::fc_int16(dev_ctx.x_context(), mat_dim_a.trans_, mat_dim_b.trans_,
mat_dim_a.height_, mat_dim_b.width_, mat_dim_a.width_,
alpha, a.data<T>(), b.data<T>(), 0.0f, data_c);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
} else {
// batch matmul
int r = xpu::batched_gemm_int16(dev_ctx.x_context(), mat_dim_a.trans_,
mat_dim_b.trans_, mat_dim_a.batch_size_,
mat_dim_a.height_, mat_dim_b.width_,
mat_dim_a.width_, alpha, a.data<T>(),
b.data<T>(), data_c, nullptr, nullptr);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
}
}
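  // Computes one input gradient. When either operand is rank-3 while the
  // gradient to produce is rank-2, the rank-3 operand is first flattened to
  // 2-D, either by merging its leading dims (FoldInitDims) or by transposing
  // head and tail dims on the device (XPUFoldHeadAndLastDims), depending on
  // the corresponding is_fold_init_dims_* flag, so that a single GEMM
  // suffices.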
void CalcInputGrad(const framework::ExecutionContext &context,
const framework::Tensor &a, bool trans_a,
bool is_fold_init_dims_a, const framework::Tensor &b,
bool trans_b, bool is_fold_init_dims_b,
framework::Tensor *out) const {
if (out == nullptr) return;
bool need_combine = (a.dims().size() == 3 || b.dims().size() == 3) &&
out->dims().size() == 2;
if (!need_combine) {
MatMul(context, a, trans_a, b, trans_b, out);
} else {
auto &dev_ctx = context.template device_context<DeviceContext>();
MatMul(
context, is_fold_init_dims_a
? FoldInitDims(a)
: XPUFoldHeadAndLastDims<DeviceContext, T>(dev_ctx, a),
trans_a, is_fold_init_dims_b
? FoldInitDims(b)
: XPUFoldHeadAndLastDims<DeviceContext, T>(dev_ctx, b),
trans_b, out);
}
}
void Compute(const framework::ExecutionContext &context) const override {
auto x = *context.Input<framework::Tensor>("X");
auto y = *context.Input<framework::Tensor>("Y");
auto dout =
*context.Input<framework::Tensor>(framework::GradVarName("Out"));
auto *dx = context.Output<framework::Tensor>(framework::GradVarName("X"));
auto *dy = context.Output<framework::Tensor>(framework::GradVarName("Y"));
bool transpose_x = context.Attr<bool>("transpose_X");
bool transpose_y = context.Attr<bool>("transpose_Y");
ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y);
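    // dx/dy keep their original dims; they are temporarily resized to the
    // matrix-sequence shapes for the GEMM calls and restored at the end.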
framework::DDim dx_dims;
if (dx) {
dx_dims = dx->dims();
if (dx_dims != x.dims()) {
dx->Resize(x.dims());
}
}
framework::DDim dy_dims;
if (dy) {
dy_dims = dy->dims();
if (dy_dims != y.dims()) {
dy->Resize(y.dims());
}
}
if (transpose_x && transpose_y) {
CalcInputGrad(context, y, true, true, dout, true, false, dx);
CalcInputGrad(context, dout, true, true, x, true, false, dy);
} else if (transpose_x) {
CalcInputGrad(context, y, false, false, dout, true, false, dx);
CalcInputGrad(context, x, false, false, dout, false, true, dy);
} else if (transpose_y) {
CalcInputGrad(context, dout, false, false, y, false, true, dx);
CalcInputGrad(context, dout, true, true, x, false, true, dy);
} else {
CalcInputGrad(context, dout, false, false, y, true, false, dx);
CalcInputGrad(context, x, true, true, dout, false, true, dy);
}
if (dx) {
if (dx_dims != x.dims()) {
dx->Resize(dx_dims);
}
}
if (dy) {
if (dy_dims != y.dims()) {
dy->Resize(dy_dims);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
matmul, ops::MatMulXPUKernel<paddle::platform::XPUDeviceContext, float>);
REGISTER_OP_XPU_KERNEL(
matmul_grad,
ops::MatMulGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
#endif
...@@ -14,11 +14,11 @@ limitations under the License. */ ...@@ -14,11 +14,11 @@ limitations under the License. */
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/mul_op.h"
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/operators/mul_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -42,21 +42,21 @@ class MVOp : public framework::OperatorWithKernel { ...@@ -42,21 +42,21 @@ class MVOp : public framework::OperatorWithKernel {
OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "mv"); OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "mv");
auto dim_x = context->GetInputDim("X"); auto dim_x = context->GetInputDim("X");
auto dim_y = context->GetInputDim("Vec"); auto dim_vec = context->GetInputDim("Vec");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
dim_x.size(), 2, dim_x.size(), 2,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The rank of input X should be 2, but is %d", dim_x.size())); "The rank of input X should be 2, but is %d", dim_x.size()));
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
dim_y.size(), 1, dim_vec.size(), 1,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The rank of input Vec should be 1, but is %d", dim_y.size())); "The rank of input Vec should be 1, but is %d", dim_vec.size()));
PADDLE_ENFORCE_EQ(dim_x[1] == dim_y[0], true, PADDLE_ENFORCE_EQ(dim_x[1], dim_vec[0],
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The length of input X' second dim should equal the " "X's second dimension is expected to be equal to "
"length of input Vec," "Vec's first dimension"
" but X[%d, %d], Vec[%d]", "but recieved X'shape = [%s], Vec's shape = [%s]",
dim_x[0], dim_x[1], dim_y[0])); dim_x, dim_vec));
framework::DDim dim_out = framework::make_ddim({dim_x[0]}); framework::DDim dim_out = framework::make_ddim({dim_x[0]});
......
...@@ -19,8 +19,8 @@ namespace paddle { ...@@ -19,8 +19,8 @@ namespace paddle {
namespace operators { namespace operators {
template <typename T> template <typename T>
__global__ void MVGradCUDAKernel(const int m, const int n, const T *dout, __global__ void MVGradDxCUDAKernel(const int m, const int n, const T *dout,
const T *vec, T *dx) { const T *vec, T *dx) {
int idx = blockDim.x * blockIdx.x + threadIdx.x; int idx = blockDim.x * blockIdx.x + threadIdx.x;
for (; idx < m * n; idx += blockDim.x * gridDim.x) { for (; idx < m * n; idx += blockDim.x * gridDim.x) {
int i = idx / n; int i = idx / n;
...@@ -52,32 +52,31 @@ class MVGradKernel<platform::CUDADeviceContext, T> ...@@ -52,32 +52,31 @@ class MVGradKernel<platform::CUDADeviceContext, T>
int m = dim_x[0]; int m = dim_x[0];
int n = dim_x[1]; int n = dim_x[1];
dx->Resize(framework::make_ddim({m * n}));
// get data ptr // get data ptr
const T *x_data = x->data<T>(); const T *x_data = x->data<T>();
const T *vec_data = vec->data<T>(); const T *vec_data = vec->data<T>();
const T *dout_data = dout->data<T>(); const T *dout_data = dout->data<T>();
T *dx_data = dx->mutable_data<T>(context.GetPlace());
T *dvec_data = dvec->mutable_data<T>(context.GetPlace());
auto &dev_ctx = auto &dev_ctx =
context.template device_context<platform::CUDADeviceContext>(); context.template device_context<platform::CUDADeviceContext>();
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx); auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
// calculate dx
auto stream = context.cuda_device_context().stream(); auto stream = context.cuda_device_context().stream();
auto config = GetGpuLaunchConfig1D(dev_ctx, m * n); auto config = GetGpuLaunchConfig1D(dev_ctx, m * n);
MVGradCUDAKernel<
T><<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
m, n, dout_data, vec_data, dx_data);
dx->Resize(framework::make_ddim({m, n})); if (dx) {
T *dx_data = dx->mutable_data<T>(context.GetPlace());
MVGradDxCUDAKernel<
T><<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
m, n, dout_data, vec_data, dx_data);
}
if (dvec) {
T *dvec_data = dvec->mutable_data<T>(context.GetPlace());
// calculate dvec blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data,
blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data, static_cast<T>(0), dvec_data);
static_cast<T>(0), dvec_data); }
} }
}; };
......
...@@ -74,30 +74,30 @@ class MVGradKernel : public framework::OpKernel<T> { ...@@ -74,30 +74,30 @@ class MVGradKernel : public framework::OpKernel<T> {
int m = dim_x[0]; int m = dim_x[0];
int n = dim_x[1]; int n = dim_x[1];
dx->Resize(framework::make_ddim({m * n}));
// get data ptr // get data ptr
const T *x_data = x->data<T>(); const T *x_data = x->data<T>();
const T *vec_data = vec->data<T>(); const T *vec_data = vec->data<T>();
const T *dout_data = dout->data<T>(); const T *dout_data = dout->data<T>();
T *dx_data = dx->mutable_data<T>(context.GetPlace()); if (dx) {
T *dvec_data = dvec->mutable_data<T>(context.GetPlace()); T *dx_data = dx->mutable_data<T>(context.GetPlace());
auto &dev_ctx = context.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
// calculate dx for (int i = 0; i < m; ++i) {
for (int i = 0; i < m; ++i) { for (int j = 0; j < n; ++j) {
for (int j = 0; j < n; ++j) dx_data[i * n + j] = dout_data[i] * vec_data[j];
dx_data[i * n + j] = dout_data[i] * vec_data[j]; }
}
} }
dx->Resize(framework::make_ddim({m, n})); if (dvec) {
T *dvec_data = dvec->mutable_data<T>(context.GetPlace());
auto &dev_ctx = context.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
// calculate dvec blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data,
blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data, static_cast<T>(0), dvec_data);
static_cast<T>(0), dvec_data); }
} }
}; };
......
...@@ -24,32 +24,45 @@ class DpsgdOp : public framework::OperatorWithKernel { ...@@ -24,32 +24,45 @@ class DpsgdOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true, PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
"Input(Param) of DpsgdOp should not be null."); platform::errors::NotFound(
"Input(Param) of DpsgdOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true, PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true,
"Input(Grad) of DpsgdOp should not be null."); platform::errors::NotFound(
PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true, "Input(Grad) of DpsgdOp should not be null."));
"Input(LearningRate) of DpsgdOp should not be null."); PADDLE_ENFORCE_EQ(
ctx->HasInput("LearningRate"), true,
platform::errors::NotFound(
"Input(LearningRate) of DpsgdOp should not be null."));
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
ctx->GetInputsVarType("Param").front(), ctx->GetInputsVarType("Param").front(),
framework::proto::VarType::LOD_TENSOR, framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s", platform::errors::InvalidArgument(
ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); "The input var's type should be LoDTensor, but the received is %s",
ctx->GetInputsVarType("Param").front()));
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
ctx->GetInputsVarType("Grad").front(), ctx->GetInputsVarType("Grad").front(),
framework::proto::VarType::LOD_TENSOR, framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s", platform::errors::InvalidArgument(
ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front()); "The input var's type should be LoDTensor, but the received is %s",
ctx->GetInputsVarType("Grad").front()));
PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true, PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
"Output(ParamOut) of DpsgdOp should not be null."); platform::errors::NotFound(
"Output(ParamOut) of DpsgdOp should not be null."));
auto lr_dims = ctx->GetInputDim("LearningRate"); auto lr_dims = ctx->GetInputDim("LearningRate");
PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
"Learning rate should have 1 dimension"); platform::errors::InvalidArgument(
"Learning rate should have 1 dimension. But Received "
"LearningRate's dims [%s].",
framework::product(lr_dims)));
auto param_dims = ctx->GetInputDim("Param"); auto param_dims = ctx->GetInputDim("Param");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Grad"), param_dims, ctx->GetInputDim("Grad"),
"Param and Grad input of DpsgdOp should have same dimension"); platform::errors::InvalidArgument(
"Param and Grad input of DpsgdOp should have same dimension. But "
"received Para's dim [%s] and Grad's dim [%s].",
param_dims, ctx->GetInputDim("Grad")));
ctx->SetOutputDim("ParamOut", param_dims); ctx->SetOutputDim("ParamOut", param_dims);
} }
......
...@@ -28,17 +28,19 @@ class DpsgdOpKernel : public framework::OpKernel<T> { ...@@ -28,17 +28,19 @@ class DpsgdOpKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext &ctx) const override { void Compute(const framework::ExecutionContext &ctx) const override {
const auto *param_var = ctx.InputVar("Param"); const auto *param_var = ctx.InputVar("Param");
PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true, PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
"The Var(%s)'s type should be LoDTensor, " platform::errors::InvalidArgument(
"but the received is %s", "The Var(%s)'s type should be LoDTensor, "
ctx.InputNames("Param").front(), "but the received is %s",
framework::ToTypeName(param_var->Type())); ctx.InputNames("Param").front(),
framework::ToTypeName(param_var->Type())));
const auto *grad_var = ctx.InputVar("Grad"); const auto *grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true, PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
"The Var(%s)'s type should be LoDTensor, " platform::errors::InvalidArgument(
"but the received is %s", "The Var(%s)'s type should be LoDTensor, "
ctx.InputNames("Grad").front(), "but the received is %s",
framework::ToTypeName(grad_var->Type())); ctx.InputNames("Grad").front(),
framework::ToTypeName(grad_var->Type())));
const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate"); const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
......
...@@ -40,43 +40,62 @@ class MomentumOp : public framework::OperatorWithKernel { ...@@ -40,43 +40,62 @@ class MomentumOp : public framework::OperatorWithKernel {
protected: protected:
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Param"), PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
"Input(param) of Momentum should not be null."); platform::errors::NotFound(
PADDLE_ENFORCE(ctx->HasInput("Grad"), "Input(param) of Momentum should not be null."));
"Input(grad) of Momentum should not be null."); PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true,
PADDLE_ENFORCE(ctx->HasInput("Velocity"), platform::errors::NotFound(
"Input(velocity) of Momentum should not be null."); "Input(grad) of Momentum should not be null."));
PADDLE_ENFORCE(ctx->HasInput("LearningRate"), PADDLE_ENFORCE_EQ(ctx->HasInput("Velocity"), true,
"Input(LearningRate) of Momentum should not be null."); platform::errors::NotFound(
PADDLE_ENFORCE( "Input(velocity) of Momentum should not be null."));
ctx->GetInputsVarType("Param").front() == PADDLE_ENFORCE_EQ(
framework::proto::VarType::LOD_TENSOR, ctx->HasInput("LearningRate"), true,
"The input var's type should be LoDTensor, but the received is %s", platform::errors::NotFound(
ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); "Input(LearningRate) of Momentum should not be null."));
PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), ctx->GetInputsVarType("Param").front(),
"Output(ParamOut) of Momentum should not be null."); framework::proto::VarType::LOD_TENSOR,
PADDLE_ENFORCE(ctx->HasOutput("VelocityOut"), platform::errors::InvalidArgument(
"Output(VelocityOut) of Momentum should not be null."); "The input var's type should be LoDTensor, but the received is %s",
ctx->GetInputsVarType("Param").front()));
PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
platform::errors::NotFound(
"Output(ParamOut) of Momentum should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasOutput("VelocityOut"), true,
platform::errors::NotFound(
"Output(VelocityOut) of Momentum should not be null."));
auto lr_dims = ctx->GetInputDim("LearningRate"); auto lr_dims = ctx->GetInputDim("LearningRate");
PADDLE_ENFORCE_NE(framework::product(lr_dims), 0, PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
"Maybe the Input variable LearningRate has not " platform::errors::InvalidArgument(
"been initialized. You may need to confirm " "Maybe the Input variable LearningRate has not "
"if you put exe.run(startup_program) " "been initialized. You may need to confirm "
"after optimizer.minimize function."); "if you put exe.run(startup_program) "
"after optimizer.minimize function."));
PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
"Learning_rate should be a scalar"); platform::errors::InvalidArgument(
"Learning_rate should be a scalar. But Received "
"LearningRate's dim [%s]",
framework::product(lr_dims)));
auto param_dim = ctx->GetInputDim("Param"); auto param_dim = ctx->GetInputDim("Param");
if (ctx->GetInputsVarType("Grad")[0] == if (ctx->GetInputsVarType("Grad")[0] ==
framework::proto::VarType::LOD_TENSOR) { framework::proto::VarType::LOD_TENSOR) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
param_dim, ctx->GetInputDim("Grad"), param_dim, ctx->GetInputDim("Grad"),
"Param and Grad input of MomentumOp should have the same dimension."); platform::errors::InvalidArgument(
"Param and Grad input of MomentumOp should have the same "
"dimension. But received Param's dim [%s] and Grad's dim [%s].",
param_dim, ctx->GetInputDim("Grad")));
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
param_dim, ctx->GetInputDim("Velocity"), param_dim, ctx->GetInputDim("Velocity"),
"Param and Velocity of MomentumOp should have the same dimension."); platform::errors::InvalidArgument(
"Param and Velocity of MomentumOp should have the same "
"dimension. But received Param's dim [%s] and Velocity [%s].",
param_dim, ctx->GetInputDim("Velocity")));
} }
ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("ParamOut", param_dim);
...@@ -398,10 +417,12 @@ class MomentumOpKernel : public framework::OpKernel<T> { ...@@ -398,10 +417,12 @@ class MomentumOpKernel : public framework::OpKernel<T> {
for_range(functor); for_range(functor);
} }
} else { } else {
PADDLE_THROW( PADDLE_ENFORCE_EQ(false, true,
string::Sprintf("MomentumOp only supports LoDTensor or SelectedRows " platform::errors::PermissionDenied(
"gradient, but the received Variable Type is %s", "Unsupported Variable Type of Grad "
 framework::ToTypeName(grad_var->Type()))); "in MomentumOp. Expected LoDTensor "
 "or SelectedRows, but received [%s]",
paddle::framework::ToTypeName(grad_var->Type())));
} }
} }
}; };
......
...@@ -22,47 +22,75 @@ class RmspropOp : public framework::OperatorWithKernel { ...@@ -22,47 +22,75 @@ class RmspropOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Param"), PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
"Input(Param) of RmspropOp should not be null."); platform::errors::NotFound(
PADDLE_ENFORCE(ctx->HasInput("MeanSquare"), "Input(Param) of RmspropOp should not be null."));
"Input(MeanSquare) of RmspropOp should not be null."); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE(ctx->HasInput("LearningRate"), ctx->HasInput("MeanSquare"), true,
"Input(LearningRate) of RmspropOp should not be null."); platform::errors::NotFound(
PADDLE_ENFORCE(ctx->HasInput("Grad"), "Input(MeanSquare) of RmspropOp should not be null."));
"Input(Grad) of RmspropOp should not be null."); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE(ctx->HasInput("Moment"), ctx->HasInput("LearningRate"), true,
"Input(Moment) of RmspropOp should not be null."); platform::errors::NotFound(
PADDLE_ENFORCE( "Input(LearningRate) of RmspropOp should not be null."));
ctx->GetInputsVarType("Param").front() == PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true,
framework::proto::VarType::LOD_TENSOR, platform::errors::NotFound(
"The input var's type should be LoDTensor, but the received is %s", "Input(Grad) of RmspropOp should not be null."));
ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); PADDLE_ENFORCE_EQ(ctx->HasInput("Moment"), true,
platform::errors::NotFound(
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), "Input(Moment) of RmspropOp should not be null."));
"Output(param_out) of RmspropOp should not be null."); PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("Param").front(),
PADDLE_ENFORCE(ctx->HasOutput("MomentOut"), framework::proto::VarType::LOD_TENSOR,
"Output(MomentOut) of RmspropOp should not be null."); platform::errors::InvalidArgument(
PADDLE_ENFORCE(ctx->HasOutput("MeanSquareOut"), "The input var's type in RmspropOp should be "
"Output(MeanSquareOut) of RmspropOp should not be null."); "LoDTensor, but the received is %s",
ctx->GetInputsVarType("Param").front()));
PADDLE_ENFORCE_EQ(
ctx->HasOutput("ParamOut"), true,
platform::errors::NotFound(
"Output(param_out) of RmspropOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasOutput("MomentOut"), true,
platform::errors::NotFound(
"Output(MomentOut) of RmspropOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasOutput("MeanSquareOut"), true,
platform::errors::NotFound(
"Output(MeanSquareOut) of RmspropOp should not be null."));
if (ctx->Attrs().Get<bool>("centered")) { if (ctx->Attrs().Get<bool>("centered")) {
PADDLE_ENFORCE(ctx->HasOutput("MeanGradOut"), PADDLE_ENFORCE_EQ(
"Output(MeanGradOut) of RmspropOp should not be null."); ctx->HasOutput("MeanGradOut"), true,
platform::errors::NotFound(
"Output(MeanGradOut) of RmspropOp should not be null."));
} }
auto param_dim = ctx->GetInputDim("Param"); auto param_dim = ctx->GetInputDim("Param");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
param_dim, ctx->GetInputDim("Grad"), param_dim, ctx->GetInputDim("Grad"),
"Param and grad input of RmspropOp should have the same dimension."); platform::errors::InvalidArgument(
"Param and grad input of RmspropOp should have the same dimension. "
"But received Param's dim [%s] and Grad's dim [%s].",
param_dim, ctx->GetInputDim("Grad")));
PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Moment"), PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Moment"),
"Param and Momentum input of RmspropOp " platform::errors::InvalidArgument(
"should have the same dimension."); "Param and Momentum input of RmspropOp "
"should have the same dimension. But received "
"Param's dim [%s] and Moment [%s]",
param_dim, ctx->GetInputDim("Moment")));
PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("MeanSquare"), PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("MeanSquare"),
"Param and Momentum input of RmspropOp " platform::errors::InvalidArgument(
"should have the same dimension."); "Param and Momentum input of RmspropOp "
"should have the same dimension. But received "
"Param's dim [%s] and MeanSquare [%s]",
param_dim, ctx->GetInputDim("MeanSquare")));
auto lr_dim = ctx->GetInputDim("LearningRate"); auto lr_dim = ctx->GetInputDim("LearningRate");
PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1, PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1,
"Learning Rate should be a scalar."); platform::errors::InvalidArgument(
"Learning Rate of RmspropOp should be a scalar. But "
"received LearningRate's dim [%s]",
framework::product(lr_dim)));
ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("ParamOut", param_dim);
ctx->SetOutputDim("MomentOut", param_dim); ctx->SetOutputDim("MomentOut", param_dim);
......
...@@ -148,11 +148,15 @@ class RmspropOpKernel : public framework::OpKernel<T> { ...@@ -148,11 +148,15 @@ class RmspropOpKernel : public framework::OpKernel<T> {
auto &mom_tensor = *ctx.Input<LoDTensor>("Moment"); auto &mom_tensor = *ctx.Input<LoDTensor>("Moment");
PADDLE_ENFORCE_EQ(&p_tensor, param_out, PADDLE_ENFORCE_EQ(&p_tensor, param_out,
"Param and ParamOut must be the same Tensor"); platform::errors::InvalidArgument(
"Param and ParamOut must be the same Tensor"));
PADDLE_ENFORCE_EQ(&mom_tensor, moment_out, PADDLE_ENFORCE_EQ(&mom_tensor, moment_out,
"Moment and MomentOut must be the same Tensor"); platform::errors::InvalidArgument(
PADDLE_ENFORCE_EQ(&ms_tensor, mean_square_out, "Moment and MomentOut must be the same Tensor"));
"MeanSquare and MeanSquareOut must be the same Tensor"); PADDLE_ENFORCE_EQ(
&ms_tensor, mean_square_out,
platform::errors::InvalidArgument(
"MeanSquare and MeanSquareOut must be the same Tensor"));
auto &dev_ctx = ctx.template device_context<DeviceContext>(); auto &dev_ctx = ctx.template device_context<DeviceContext>();
size_t limit = static_cast<size_t>(ms_tensor.numel()); size_t limit = static_cast<size_t>(ms_tensor.numel());
...@@ -179,8 +183,10 @@ class RmspropOpKernel : public framework::OpKernel<T> { ...@@ -179,8 +183,10 @@ class RmspropOpKernel : public framework::OpKernel<T> {
auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad"); auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
auto mg = EigenVector<T>::Flatten(mg_tensor); auto mg = EigenVector<T>::Flatten(mg_tensor);
auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut"); auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out, PADDLE_ENFORCE_EQ(
"MeanGrad and MeanGradOut must be the same Tensor"); &mg_tensor, mean_grad_out,
platform::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
auto mg_out = EigenVector<T>::Flatten(*mean_grad_out); auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
mg_out.device(place) = rho * mg + (1 - rho) * g; mg_out.device(place) = rho * mg + (1 - rho) * g;
...@@ -198,8 +204,10 @@ class RmspropOpKernel : public framework::OpKernel<T> { ...@@ -198,8 +204,10 @@ class RmspropOpKernel : public framework::OpKernel<T> {
if (centered) { if (centered) {
auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad"); auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut"); auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out, PADDLE_ENFORCE_EQ(
"MeanGrad and MeanGradOut must be the same Tensor"); &mg_tensor, mean_grad_out,
platform::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
for_range(CenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>( for_range(CenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>(
param_out->mutable_data<T>(ctx.GetPlace()), param_out->mutable_data<T>(ctx.GetPlace()),
mean_square_out->mutable_data<T>(ctx.GetPlace()), mean_square_out->mutable_data<T>(ctx.GetPlace()),
...@@ -233,8 +241,10 @@ class RmspropOpKernel : public framework::OpKernel<T> { ...@@ -233,8 +241,10 @@ class RmspropOpKernel : public framework::OpKernel<T> {
if (centered) { if (centered) {
auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad"); auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut"); auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out, PADDLE_ENFORCE_EQ(
"MeanGrad and MeanGradOut must be the same Tensor"); &mg_tensor, mean_grad_out,
platform::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
for_range(CenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>( for_range(CenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>(
param_out->mutable_data<T>(ctx.GetPlace()), param_out->mutable_data<T>(ctx.GetPlace()),
mean_square_out->mutable_data<T>(ctx.GetPlace()), mean_square_out->mutable_data<T>(ctx.GetPlace()),
...@@ -249,7 +259,12 @@ class RmspropOpKernel : public framework::OpKernel<T> { ...@@ -249,7 +259,12 @@ class RmspropOpKernel : public framework::OpKernel<T> {
rho, epsilon, momentum, grad_func)); rho, epsilon, momentum, grad_func));
} }
} else { } else {
PADDLE_THROW("RMSProp only supports LoDTensor or SelectedRows gradient"); PADDLE_ENFORCE_EQ(false, true,
platform::errors::PermissionDenied(
"Unsupported Variable Type of Grad "
"in RmspropOp. Excepted LodTensor "
"or SelectedRows, But received [%s]",
paddle::framework::ToTypeName(grad_var->Type())));
} }
} }
}; };
......
...@@ -22,23 +22,31 @@ class SGDOp : public framework::OperatorWithKernel { ...@@ -22,23 +22,31 @@ class SGDOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Param"), PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
"Input(Param) of SGDOp should not be null."); platform::errors::NotFound(
PADDLE_ENFORCE(ctx->HasInput("Grad"), "Input(Param) of SGDOp should not be null."));
"Input(Grad) of SGDOp should not be null."); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE(ctx->HasInput("LearningRate"), ctx->HasInput("Grad"), true,
"Input(LearningRate) of SGDOp should not be null."); platform::errors::NotFound("Input(Grad) of SGDOp should not be null."));
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true,
"Output(ParamOut) of SGDOp should not be null."); platform::errors::NotFound(
"Input(LearningRate) of SGDOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
platform::errors::NotFound(
"Output(ParamOut) of SGDOp should not be null."));
auto lr_dims = ctx->GetInputDim("LearningRate"); auto lr_dims = ctx->GetInputDim("LearningRate");
PADDLE_ENFORCE_NE(framework::product(lr_dims), 0, PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
"Maybe the Input variable LearningRate has not " platform::errors::NotFound(
"been initialized. You may need to confirm " "Maybe the Input variable LearningRate has not "
"if you put exe.run(startup_program) " "been initialized. You may need to confirm "
"after optimizer.minimize function."); "if you put exe.run(startup_program) "
"after optimizer.minimize function."));
PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
"Learning rate should have 1 element"); platform::errors::InvalidArgument(
"Learning rate should have 1 element. But received "
"LearningRate dims [%s]",
framework::product(lr_dims)));
auto param_dim = ctx->GetInputDim("Param"); auto param_dim = ctx->GetInputDim("Param");
if (ctx->GetInputsVarType("Grad")[0] == if (ctx->GetInputsVarType("Grad")[0] ==
framework::proto::VarType::LOD_TENSOR) { framework::proto::VarType::LOD_TENSOR) {
......
...@@ -57,11 +57,12 @@ class SGDOpKernel<platform::CUDADeviceContext, T> ...@@ -57,11 +57,12 @@ class SGDOpKernel<platform::CUDADeviceContext, T>
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param"); const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(), PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
"The Var(%s)'s type should be LoDTensor, " platform::errors::InvalidArgument(
"but the received is %s", "The Var(%s)'s type should be LoDTensor, "
ctx.InputNames("Param").front(), "but the received is %s",
framework::ToTypeName(param_var->Type())); ctx.InputNames("Param").front(),
paddle::framework::ToTypeName(param_var->Type())));
auto* param = ctx.Input<framework::Tensor>("Param"); auto* param = ctx.Input<framework::Tensor>("Param");
auto* param_out = ctx.Output<framework::Tensor>("ParamOut"); auto* param_out = ctx.Output<framework::Tensor>("ParamOut");
...@@ -91,18 +92,30 @@ class SGDOpKernel<platform::CUDADeviceContext, T> ...@@ -91,18 +92,30 @@ class SGDOpKernel<platform::CUDADeviceContext, T>
// TODO(qijun): In Sparse SGD operator, in-place update is enforced. // TODO(qijun): In Sparse SGD operator, in-place update is enforced.
// This manual optimization brings difficulty to track data dependency. // This manual optimization brings difficulty to track data dependency.
// It's better to find a more elegant solution. // It's better to find a more elegant solution.
PADDLE_ENFORCE_EQ(param, param_out); PADDLE_ENFORCE_EQ(
param, param_out,
platform::errors::InvalidArgument(
"The input tensor Param of SgdOp should be equal with ParamOut "
"if variable's type is SelectedRows."));
auto* grad = ctx.Input<framework::SelectedRows>("Grad"); auto* grad = ctx.Input<framework::SelectedRows>("Grad");
auto in_height = grad->height(); auto in_height = grad->height();
auto out_dims = param_out->dims(); auto out_dims = param_out->dims();
PADDLE_ENFORCE_EQ(in_height, out_dims[0]); PADDLE_ENFORCE_EQ(in_height, out_dims[0],
platform::errors::InvalidArgument(
"The input tensor Grad's height of SgdOp should be "
"equal with ParamOut's dims. But received Grad's "
"height [%s] and ParamOut's dims [%s]",
in_height, out_dims[0]));
auto& in_value = grad->value(); auto& in_value = grad->value();
auto& in_rows = grad->rows(); auto& in_rows = grad->rows();
int64_t in_row_numel = in_value.numel() / in_rows.size(); int64_t in_row_numel = in_value.numel() / in_rows.size();
PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height); PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height,
platform::errors::InvalidArgument(
"The in_row_numel of SgdOp should be equal with "
"param_out's numel / in_height."));
auto* in_data = in_value.data<T>(); auto* in_data = in_value.data<T>();
auto* out_data = param_out->data<T>(); auto* out_data = param_out->data<T>();
...@@ -118,7 +131,12 @@ class SGDOpKernel<platform::CUDADeviceContext, T> ...@@ -118,7 +131,12 @@ class SGDOpKernel<platform::CUDADeviceContext, T>
out_data, in_row_numel, in_rows.size()); out_data, in_row_numel, in_rows.size());
} else { } else {
PADDLE_THROW("Unsupported Variable Type of Grad"); PADDLE_ENFORCE_EQ(false, true,
platform::errors::PermissionDenied(
"Unsupported Variable Type of Grad "
"in SgdOp. Excepted LodTensor or "
"SelectedRows, But received [%s]",
paddle::framework::ToTypeName(grad_var->Type())));
} }
} }
}; };
......
...@@ -44,8 +44,20 @@ class SGDOpKernel<platform::CPUDeviceContext, T> ...@@ -44,8 +44,20 @@ class SGDOpKernel<platform::CPUDeviceContext, T>
if (grad_var->IsType<framework::LoDTensor>()) { if (grad_var->IsType<framework::LoDTensor>()) {
const auto *grad = ctx.Input<framework::Tensor>("Grad"); const auto *grad = ctx.Input<framework::Tensor>("Grad");
auto sz = param_out->numel(); auto sz = param_out->numel();
PADDLE_ENFORCE_EQ(param->numel(), sz); PADDLE_ENFORCE_EQ(param->numel(), sz,
PADDLE_ENFORCE_EQ(grad->numel(), sz); platform::errors::InvalidArgument(
"The input tensor Param's numel of SgdOp "
"should be equal with ParamOut's numel. "
"But received Param's "
"numel = [%s], ParamOut's numel = [%s]",
param->numel(), sz));
PADDLE_ENFORCE_EQ(grad->numel(), sz,
platform::errors::InvalidArgument(
"The input tensor Grad's numel of SgdOp "
"should be equal with ParamOut's numel. "
"But received Grad's "
"numel = [%s], ParamOut's numel = [%s]",
grad->numel(), sz));
jit::sgd_attr_t attr(1, sz, 1, sz, 1); jit::sgd_attr_t attr(1, sz, 1, sz, 1);
const T *lr = learning_rate->data<T>(); const T *lr = learning_rate->data<T>();
...@@ -62,7 +74,11 @@ class SGDOpKernel<platform::CPUDeviceContext, T> ...@@ -62,7 +74,11 @@ class SGDOpKernel<platform::CPUDeviceContext, T>
// TODO(qijun): In Sparse SGD operator, in-place update is enforced. // TODO(qijun): In Sparse SGD operator, in-place update is enforced.
// This manual optimization brings difficulty to track data dependency. // This manual optimization brings difficulty to track data dependency.
// It's better to find a more elegant solution. // It's better to find a more elegant solution.
PADDLE_ENFORCE_EQ(param, param_out); PADDLE_ENFORCE_EQ(param, param_out,
platform::errors::InvalidArgument(
"The input tensor Param of SgdOp "
"should be equal with ParamOut if variable's "
"type is SelectedRows. "));
const auto *grad = ctx.Input<framework::SelectedRows>("Grad"); const auto *grad = ctx.Input<framework::SelectedRows>("Grad");
auto &grad_rows = grad->rows(); auto &grad_rows = grad->rows();
...@@ -73,7 +89,13 @@ class SGDOpKernel<platform::CPUDeviceContext, T> ...@@ -73,7 +89,13 @@ class SGDOpKernel<platform::CPUDeviceContext, T>
} }
auto out_dims = param_out->dims(); auto out_dims = param_out->dims();
PADDLE_ENFORCE_EQ(grad->height(), out_dims[0]); PADDLE_ENFORCE_EQ(
grad->height(), out_dims[0],
platform::errors::InvalidArgument(
"The input tensor Grad's height of SgdOp "
"should be equal with ParamOut's dims. But received Grad's "
"height [%s] and ParamOut's dims [%s]",
grad->height(), out_dims[0]));
auto &grad_value = grad->value(); auto &grad_value = grad->value();
const T *param_data = param->data<T>(); const T *param_data = param->data<T>();
const T *grad_data = grad_value.data<T>(); const T *grad_data = grad_value.data<T>();
...@@ -87,19 +109,31 @@ class SGDOpKernel<platform::CPUDeviceContext, T> ...@@ -87,19 +109,31 @@ class SGDOpKernel<platform::CPUDeviceContext, T>
attr.grad_height = grad_rows.size(); // note: it is not grad->height() attr.grad_height = grad_rows.size(); // note: it is not grad->height()
attr.grad_width = grad_value.numel() / attr.grad_height; attr.grad_width = grad_value.numel() / attr.grad_height;
attr.selected_rows_size = grad_rows.size(); attr.selected_rows_size = grad_rows.size();
PADDLE_ENFORCE_EQ(attr.grad_width, attr.param_width); PADDLE_ENFORCE_EQ(
attr.grad_width, attr.param_width,
platform::errors::InvalidArgument(
"The grad_value's numel of SgdOp "
"should be equal with param_out's numel. But received "
"grad_value's numel [%s] and param_out's numel [%s]",
attr.grad_width, attr.param_width));
auto sgd = auto sgd =
jit::KernelFuncs<jit::SgdTuple<T>, platform::CPUPlace>::Cache().At( jit::KernelFuncs<jit::SgdTuple<T>, platform::CPUPlace>::Cache().At(
attr); attr);
sgd(lr, param_data, grad_data, rows_data, out_data, &attr); sgd(lr, param_data, grad_data, rows_data, out_data, &attr);
} else { } else {
PADDLE_THROW("Unsupported Variable Type of Grad"); PADDLE_ENFORCE_EQ(
false, true,
platform::errors::PermissionDenied(
"Unsupported Variable Type of Grad in SgdOp. Excepted "
"LodTensor or SelectedRows, But received [%s]",
paddle::framework::ToTypeName(grad_var->Type())));
} }
} else if (param_var->IsType<framework::SelectedRows>()) { } else if (param_var->IsType<framework::SelectedRows>()) {
PADDLE_ENFORCE(grad_var->IsType<framework::SelectedRows>(), PADDLE_ENFORCE_EQ(grad_var->IsType<framework::SelectedRows>(), true,
"when param " platform::errors::InvalidArgument(
"is SelectedRows, gradient should also be SelectedRows"); "when param is SelectedRows, "
"gradient should also be SelectedRows"));
const auto &param = param_var->Get<framework::SelectedRows>(); const auto &param = param_var->Get<framework::SelectedRows>();
auto *param_out = ctx.Output<framework::SelectedRows>("ParamOut"); auto *param_out = ctx.Output<framework::SelectedRows>("ParamOut");
const auto &grad = grad_var->Get<framework::SelectedRows>(); const auto &grad = grad_var->Get<framework::SelectedRows>();
...@@ -112,27 +146,36 @@ class SGDOpKernel<platform::CPUDeviceContext, T> ...@@ -112,27 +146,36 @@ class SGDOpKernel<platform::CPUDeviceContext, T>
auto param_row_width = param.value().dims()[1]; auto param_row_width = param.value().dims()[1];
auto grad_row_width = grad.value().dims()[1]; auto grad_row_width = grad.value().dims()[1];
VLOG(4) << " param rows: " << param.rows().size() PADDLE_ENFORCE_EQ(
<< " param memory rows: " << param.value().dims()[0] param_row_width, grad_row_width,
<< " grad rows: " << grad.rows().size() platform::errors::InvalidArgument(
<< " grad memory rows: " << grad.value().dims()[0]; "The param_row in SgdOP should have the same size with grad_row. "
PADDLE_ENFORCE_EQ(param_row_width, grad_row_width, "But received param_row's width is [%s], and grad_row's width is "
"param_row should have the same size with grad_row"); "[%s]",
param_row_width, grad_row_width));
const auto *lr = learning_rate->data<T>(); const auto *lr = learning_rate->data<T>();
const auto *grad_data = grad.value().data<T>(); const auto *grad_data = grad.value().data<T>();
auto *out_data = param_out->mutable_value()->data<T>(); auto *out_data = param_out->mutable_value()->data<T>();
for (size_t i = 0; i < grad.rows().size(); i++) { for (size_t i = 0; i < grad.rows().size(); i++) {
int64_t id_index = param_out->AutoGrownIndex(grad.rows()[i], false); int64_t id_index = param_out->AutoGrownIndex(grad.rows()[i], false);
PADDLE_ENFORCE_GE(id_index, static_cast<int64_t>(0), PADDLE_ENFORCE_GE(
"id should be in the table"); id_index, static_cast<int64_t>(0),
platform::errors::InvalidArgument(
"The id in SgdOp should be >= 0. But recevied id_index is [%s]",
id_index));
for (int64_t j = 0; j < grad_row_width; j++) { for (int64_t j = 0; j < grad_row_width; j++) {
out_data[id_index * grad_row_width + j] -= out_data[id_index * grad_row_width + j] -=
lr[0] * grad_data[i * grad_row_width + j]; lr[0] * grad_data[i * grad_row_width + j];
} }
} }
} else { } else {
PADDLE_THROW("Unsupported Variable Type of Parameter"); PADDLE_ENFORCE_EQ(
false, true,
platform::errors::PermissionDenied(
"Unsupported Variable Type of Parameter in SgdOp. Excepted "
"LodTensor or SelectedRows, But received [%s]",
paddle::framework::ToTypeName(param_var->Type())));
} }
} }
}; };
......
...@@ -45,8 +45,10 @@ template <typename T> ...@@ -45,8 +45,10 @@ template <typename T>
class PoolCUDNNOpKernel : public framework::OpKernel<T> { class PoolCUDNNOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &ctx) const override { void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, PADDLE_ENFORCE_EQ(
"It must use CUDAPlace."); platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::InvalidArgument("Pool operator CUDA kernel must use "
"CUDAPlace rather than CPUPlace."));
const Tensor *input = ctx.Input<Tensor>("X"); const Tensor *input = ctx.Input<Tensor>("X");
Tensor *output = ctx.Output<Tensor>("Out"); Tensor *output = ctx.Output<Tensor>("Out");
...@@ -175,8 +177,10 @@ template <typename T> ...@@ -175,8 +177,10 @@ template <typename T>
class PoolCUDNNGradOpKernel : public framework::OpKernel<T> { class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &ctx) const override { void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, PADDLE_ENFORCE_EQ(
"It must use CUDAPlace."); platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::InvalidArgument("Pool operator CUDA kernel must use "
"CUDAPlace rather than CPUPlace."));
const Tensor *input = ctx.Input<Tensor>("X"); const Tensor *input = ctx.Input<Tensor>("X");
const Tensor *output = ctx.Input<Tensor>("Out"); const Tensor *output = ctx.Input<Tensor>("Out");
......
...@@ -38,18 +38,22 @@ int PoolOutputSize(int input_size, int filter_size, int padding_1, ...@@ -38,18 +38,22 @@ int PoolOutputSize(int input_size, int filter_size, int padding_1,
} }
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GT(
output_size, 0, output_size, 0,
"ShapeError: the output size must be greater than 0. But received: " platform::errors::InvalidArgument(
"output_size = %d due to the settings of input_size(%d), padding(%d,%d), " "the output size must be greater than 0. But received: "
"k_size(%d) and stride(%d). Please check again!", "output_size = %d due to the settings of input_size(%d), "
output_size, input_size, padding_1, padding_2, filter_size, stride); "padding(%d,%d), "
"k_size(%d) and stride(%d). Please check again!",
output_size, input_size, padding_1, padding_2, filter_size, stride));
return output_size; return output_size;
} }
void PoolOp::InferShape(framework::InferShapeContext* ctx) const { void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, PADDLE_ENFORCE_EQ(
"X(Input) of Pooling should not be null."); ctx->HasInput("X"), true,
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, platform::errors::NotFound("Input(X) of Pool operator is not found."));
"Out(Output) of Pooling should not be null."); PADDLE_ENFORCE_EQ(
ctx->HasOutput("Out"), true,
platform::errors::NotFound("Output(Out) of Pool operator is not found."));
std::string pooling_type = ctx->Attrs().Get<std::string>("pooling_type"); std::string pooling_type = ctx->Attrs().Get<std::string>("pooling_type");
std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize"); std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
...@@ -65,28 +69,32 @@ void PoolOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -65,28 +69,32 @@ void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
auto in_x_dims = ctx->GetInputDim("X"); auto in_x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
in_x_dims.size() == 4 || in_x_dims.size() == 5, true, in_x_dims.size() == 4 || in_x_dims.size() == 5, true,
"ShapeError: the input of Op(pool) should be 4-D or 5-D Tensor. But " platform::errors::InvalidArgument(
"received: %u-D Tensor and it's shape is [%s].", "the input of Op(pool) should be 4-D or 5-D Tensor. But "
in_x_dims.size(), in_x_dims); "received: %u-D Tensor and it's shape is [%s].",
in_x_dims.size(), in_x_dims));
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
in_x_dims.size() - ksize.size(), 2U, in_x_dims.size() - ksize.size(), 2U,
"ShapeError: the dimension of input minus the size of " platform::errors::InvalidArgument(
"Attr(ksize) must be euqal to 2 in Op(pool). " "the dimension of input minus the size of "
"But received: the dimension of input minus the size " "Attr(ksize) must be euqal to 2 in Op(pool). "
"of Attr(ksize) is %d, the " "But received: the dimension of input minus the size "
"input's dimension is %d, the shape of input " "of Attr(ksize) is %d, the "
"is [%s], the Attr(ksize)'s size is %d, the Attr(ksize) is [%s].", "input's dimension is %d, the shape of input "
in_x_dims.size() - ksize.size(), in_x_dims.size(), in_x_dims, "is [%s], the Attr(ksize)'s size is %d, the Attr(ksize) is [%s].",
ksize.size(), framework::make_ddim(ksize)); in_x_dims.size() - ksize.size(), in_x_dims.size(), in_x_dims,
ksize.size(), framework::make_ddim(ksize)));
PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
"ShapeError: the size of Attr(ksize) and Attr(strides) in " PADDLE_ENFORCE_EQ(
"Op(pool) must be equal. " ksize.size(), strides.size(),
"But received: Attr(ksize)'s size is %d, Attr(strides)'s " platform::errors::InvalidArgument(
"size is %d, Attr(ksize) is [%s], Attr(strides)is [%s].", "the size of Attr(ksize) and Attr(strides) in "
ksize.size(), strides.size(), framework::make_ddim(ksize), "Op(pool) must be equal. "
framework::make_ddim(strides)); "But received: Attr(ksize)'s size is %d, Attr(strides)'s "
"size is %d, Attr(ksize) is [%s], Attr(strides)is [%s].",
ksize.size(), strides.size(), framework::make_ddim(ksize),
framework::make_ddim(strides)));
// MKL-DNN Kernels are using NCHW order of dims description // MKL-DNN Kernels are using NCHW order of dims description
// so we ignore data_format consideration for MKL-DNN kernel // so we ignore data_format consideration for MKL-DNN kernel
...@@ -182,9 +190,12 @@ framework::OpKernelType PoolOp::GetKernelTypeForVar( ...@@ -182,9 +190,12 @@ framework::OpKernelType PoolOp::GetKernelTypeForVar(
} }
void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const { void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) must not be null."); PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
platform::errors::NotFound(
"Input(X) of Pool Gradoperator is not found."));
PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true,
"Input(X@GRAD) should not be null."); platform::errors::NotFound(
"Input(X@GRAD) of Pool Gradoperator is not found."));
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
} }
...@@ -210,7 +221,8 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType( ...@@ -210,7 +221,8 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
if (input_data_type == framework::proto::VarType::FP16) { if (input_data_type == framework::proto::VarType::FP16) {
PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN, PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
"float16 can only be used when CUDNN is used"); platform::errors::InvalidArgument(
"Float16 can only be used when CUDNN is used"));
} }
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_, return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
library_); library_);
......
...@@ -81,9 +81,11 @@ inline void UpdatePadding(std::vector<T>* paddings, const bool global_pooling, ...@@ -81,9 +81,11 @@ inline void UpdatePadding(std::vector<T>* paddings, const bool global_pooling,
paddings->insert(paddings->begin() + 2 * i + 1, copy_pad); paddings->insert(paddings->begin() + 2 * i + 1, copy_pad);
} }
} else { } else {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(data_dims.size() * 2, paddings->size(),
data_dims.size() * 2, paddings->size(), platform::errors::InvalidArgument(
"Paddings size should be the same or twice as the pooling size."); "Paddings size %d should be the same or twice as the "
"pooling size %d.",
paddings->size(), data_dims.size() * 2));
} }
// when padding_algorithm is "VALID" or "SAME" // when padding_algorithm is "VALID" or "SAME"
...@@ -200,7 +202,10 @@ class PoolKernel : public framework::OpKernel<T> { ...@@ -200,7 +202,10 @@ class PoolKernel : public framework::OpKernel<T> {
pool_process, exclusive, adaptive, out); pool_process, exclusive, adaptive, out);
} }
} break; } break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } default: {
PADDLE_THROW(platform::errors::InvalidArgument(
"Pool op only supports 2D and 3D input."));
}
} }
} }
}; };
...@@ -287,7 +292,10 @@ class PoolGradKernel : public framework::OpKernel<T> { ...@@ -287,7 +292,10 @@ class PoolGradKernel : public framework::OpKernel<T> {
adaptive, in_x_grad); adaptive, in_x_grad);
} }
} break; } break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } default: {
PADDLE_THROW(platform::errors::InvalidArgument(
"Pool op only supports 2D and 3D input."));
}
} }
} }
} }
......
...@@ -46,8 +46,11 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { ...@@ -46,8 +46,11 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings"); std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
bool adaptive = ctx->Attrs().Get<bool>("adaptive"); bool adaptive = ctx->Attrs().Get<bool>("adaptive");
PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, PADDLE_ENFORCE(
"Pooling intput should be 4-D or 5-D tensor."); in_x_dims.size() == 4 || in_x_dims.size() == 5,
platform::errors::InvalidArgument("Pooling intput should be 4-D or 5-D "
"tensor but received %dD-Tensor",
in_x_dims.size()));
if (ctx->Attrs().Get<bool>("global_pooling")) { if (ctx->Attrs().Get<bool>("global_pooling")) {
ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2); ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
...@@ -57,16 +60,21 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { ...@@ -57,16 +60,21 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
} }
} }
PADDLE_ENFORCE_EQ(in_x_dims.size() - ksize.size(), 2U, PADDLE_ENFORCE_EQ(
platform::errors::InvalidArgument( in_x_dims.size() - ksize.size(), 2U,
"Input size and pooling size should be consistent.")); platform::errors::InvalidArgument(
PADDLE_ENFORCE_EQ(ksize.size(), strides.size(), "The input size %d minus the kernel size %d should be equal to 2.",
platform::errors::InvalidArgument( in_x_dims.size(), ksize.size()));
"Strides size and pooling size should be the same.")); PADDLE_ENFORCE_EQ(
ksize.size(), strides.size(),
platform::errors::InvalidArgument(
"Strides size %d and pooling size %d should be the same.",
strides.size(), ksize.size()));
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
ksize.size(), paddings.size(), ksize.size(), paddings.size(),
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Paddings size and pooling size should be the same.")); "Paddings size %d and pooling size %d should be the same.",
paddings.size(), ksize.size()));
std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]}); std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
if (adaptive) { if (adaptive) {
......
...@@ -61,7 +61,10 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T1> { ...@@ -61,7 +61,10 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T1> {
pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out, pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out,
mask); mask);
} break; } break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } default: {
PADDLE_THROW(platform::errors::InvalidArgument(
"Pool op only supports 2D and 3D input."));
}
} }
} }
}; };
...@@ -106,7 +109,10 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T1> { ...@@ -106,7 +109,10 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T1> {
pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides, pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides,
paddings, adaptive, in_x_grad); paddings, adaptive, in_x_grad);
} break; } break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } default: {
PADDLE_THROW(platform::errors::InvalidArgument(
"Pool op only supports 2D and 3D input."));
}
} }
} }
} }
......
...@@ -176,22 +176,31 @@ class GPUPSROIPoolOpKernel : public framework::OpKernel<T> { ...@@ -176,22 +176,31 @@ class GPUPSROIPoolOpKernel : public framework::OpKernel<T> {
int height = in_dims[2]; int height = in_dims[2];
int width = in_dims[3]; int width = in_dims[3];
PADDLE_ENFORCE_EQ(input_channels, PADDLE_ENFORCE_EQ(
output_channels * pooled_height * pooled_width, input_channels, output_channels * pooled_height * pooled_width,
"the channels of input X should equal the product of " platform::errors::InvalidArgument(
"output_channels x pooled_height x pooled_width"); "The channels %d of input X should equal the product of "
"output_channels %d x pooled_height %d x pooled_width %d.",
input_channels, output_channels, pooled_height, pooled_width));
int rois_num = rois->dims()[0]; int rois_num = rois->dims()[0];
if (rois_num == 0) return; if (rois_num == 0) return;
auto rois_lod = rois->lod().back(); auto rois_lod = rois->lod().back();
int rois_batch_size = rois_lod.size() - 1; int rois_batch_size = rois_lod.size() - 1;
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(rois_batch_size, batch_size,
rois_batch_size, batch_size, platform::errors::InvalidArgument(
"The rois_batch_size and input(X) batch_size must be the same."); "The batch size of input(ROIs) and input(X) must be "
"the same but received batch size of input(ROIs) and "
"input(X) is %d and %d respectively.",
rois_batch_size, batch_size));
int rois_num_with_lod = rois_lod[rois_batch_size]; int rois_num_with_lod = rois_lod[rois_batch_size];
PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
"The rois_num from input and lod must be the same."); platform::errors::InvalidArgument(
"The number of rois from input(ROIs) and its LOD "
"must be the same. Received rois %d of input(ROIs) "
"but the number of rois %d from its LOD is %d",
rois_num, rois_num_with_lod));
// set rois batch id // set rois batch id
framework::Tensor rois_batch_id_list; framework::Tensor rois_batch_id_list;
......
...@@ -160,9 +160,14 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> { ...@@ -160,9 +160,14 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
if (ctx.HasInput("RoisNum")) { if (ctx.HasInput("RoisNum")) {
auto* rois_num_t = ctx.Input<Tensor>("RoisNum"); auto* rois_num_t = ctx.Input<Tensor>("RoisNum");
int rois_batch_size = rois_num_t->numel(); int rois_batch_size = rois_num_t->numel();
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
rois_batch_size, batch_size, rois_batch_size, batch_size,
"The rois_batch_size and imgs batch_size must be the same."); platform::errors::InvalidArgument(
"The batch size of input(ROIs) and input(X) must be the same but "
"received batch size of input(ROIs) and input(X) is %d and %d "
"respectively.",
rois_batch_size, batch_size));
std::vector<int> rois_num_list(rois_batch_size); std::vector<int> rois_num_list(rois_batch_size);
memory::Copy(cplace, rois_num_list.data(), gplace, memory::Copy(cplace, rois_num_list.data(), gplace,
rois_num_t->data<int>(), sizeof(int) * rois_batch_size, 0); rois_num_t->data<int>(), sizeof(int) * rois_batch_size, 0);
...@@ -178,10 +183,19 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> { ...@@ -178,10 +183,19 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
int rois_batch_size = rois_lod.size() - 1; int rois_batch_size = rois_lod.size() - 1;
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
rois_batch_size, batch_size, rois_batch_size, batch_size,
"The rois_batch_size and imgs batch_size must be the same."); platform::errors::InvalidArgument(
"The batch size of input(ROIs) and input(X) must be the same but "
"received batch size of input(ROIs) and input(X) is %d and %d "
"respectively.",
rois_batch_size, batch_size));
int rois_num_with_lod = rois_lod[rois_batch_size]; int rois_num_with_lod = rois_lod[rois_batch_size];
PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
"The rois_num from input and lod must be the same."); platform::errors::InvalidArgument(
"The number of rois from input(ROIs) and its LOD "
"must be the same. Received rois %d of input(ROIs) "
"but the number of rois %d from its LOD is %d",
rois_num, rois_num_with_lod));
for (int n = 0; n < rois_batch_size; ++n) { for (int n = 0; n < rois_batch_size; ++n) {
for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n; roi_batch_id_data[i] = n;
......
...@@ -103,13 +103,13 @@ class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -103,13 +103,13 @@ class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker {
"Target sequence length for Label when Label is a 2-D tensor.") "Target sequence length for Label when Label is a 2-D tensor.")
.AsDispensable(); .AsDispensable();
AddOutput("WarpCTCGrad", AddOutput("WarpCTCGrad",
"(Tensor, default: Tensor<float>), a temporary " "(Tensor), a temporary "
"output Tensor to store the gradients of warp-ctc, which is " "output Tensor to store the gradients of warp-ctc, which is "
"computed with loss together in one call. It is a 3-D Tensor of " "computed with loss together in one call. It is a 3-D Tensor of "
"the shape [max_sequence_length, batch_size, num_classes + 1].") "the shape [max_sequence_length, batch_size, num_classes + 1].")
.AsIntermediate(); .AsIntermediate();
AddOutput("Loss", AddOutput("Loss",
"(Tensor, default: Tensor<float>), the Connectionist " "(Tensor), the Connectionist "
"Temporal Classification (CTC) loss, which is a 2-D Tensor of " "Temporal Classification (CTC) loss, which is a 2-D Tensor of "
"the shape [batch_size, 1]"); "the shape [batch_size, 1]");
AddAttr<int>("blank", AddAttr<int>("blank",
...@@ -197,7 +197,9 @@ REGISTER_OPERATOR(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker, ...@@ -197,7 +197,9 @@ REGISTER_OPERATOR(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker,
REGISTER_OPERATOR(warpctc_grad, ops::WarpCTCGradOp, REGISTER_OPERATOR(warpctc_grad, ops::WarpCTCGradOp,
ops::WarpCTCGradOpNoNeedBufferVarInferer); ops::WarpCTCGradOpNoNeedBufferVarInferer);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
warpctc, ops::WarpCTCKernel<paddle::platform::CPUDeviceContext, float>); warpctc, ops::WarpCTCKernel<paddle::platform::CPUDeviceContext, float>,
ops::WarpCTCKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
warpctc_grad, warpctc_grad,
ops::WarpCTCGradKernel<paddle::platform::CPUDeviceContext, float>); ops::WarpCTCGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::WarpCTCGradKernel<paddle::platform::CPUDeviceContext, double>);
...@@ -16,7 +16,9 @@ limitations under the License. */ ...@@ -16,7 +16,9 @@ limitations under the License. */
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
warpctc, ops::WarpCTCKernel<paddle::platform::CUDADeviceContext, float>); warpctc, ops::WarpCTCKernel<paddle::platform::CUDADeviceContext, float>,
ops::WarpCTCKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
warpctc_grad, warpctc_grad,
ops::WarpCTCGradKernel<paddle::platform::CUDADeviceContext, float>); ops::WarpCTCGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::WarpCTCGradKernel<paddle::platform::CUDADeviceContext, double>);
...@@ -27,7 +27,52 @@ namespace operators { ...@@ -27,7 +27,52 @@ namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor;
template <typename DeviceContext, typename T>
class ComputeCtcLossFunctor {
public:
ctcStatus_t operator()(const T* const activations, T* gradients,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths, int alphabet_size,
int minibatch, T* costs, void* workspace,
ctcOptions options) {
return CTC_STATUS_EXECUTION_FAILED;
}
};
template <typename DeviceContext>
class ComputeCtcLossFunctor<DeviceContext, float> {
public:
ctcStatus_t operator()(const float* const activations, float* gradients,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths, int alphabet_size,
int minibatch, float* costs, void* workspace,
ctcOptions options) {
return platform::dynload::compute_ctc_loss(
activations, gradients, flat_labels, label_lengths, input_lengths,
static_cast<int>(alphabet_size), static_cast<int>(minibatch), costs,
workspace, options);
}
};
template <typename DeviceContext> template <typename DeviceContext>
class ComputeCtcLossFunctor<DeviceContext, double> {
public:
ctcStatus_t operator()(const double* const activations, double* gradients,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths, int alphabet_size,
int minibatch, double* costs, void* workspace,
ctcOptions options) {
return platform::dynload::compute_ctc_loss_double(
activations, gradients, flat_labels, label_lengths, input_lengths,
static_cast<int>(alphabet_size), static_cast<int>(minibatch), costs,
workspace, options);
}
};
template <typename DeviceContext, typename T>
class WarpCTCFunctor { class WarpCTCFunctor {
public: public:
/* /*
...@@ -51,21 +96,29 @@ class WarpCTCFunctor { ...@@ -51,21 +96,29 @@ class WarpCTCFunctor {
* \param blank blank label used in ctc loss function. * \param blank blank label used in ctc loss function.
* \param cpu_loss cost of each sequence in CPU memory. * \param cpu_loss cost of each sequence in CPU memory.
*/ */
void operator()(const framework::ExecutionContext& ctx, const float* input, void operator()(const framework::ExecutionContext& ctx, const T* input,
float* gradient, const int* cpu_labels, T* gradient, const int* cpu_labels,
const int* cpu_label_lengths, const int* cpu_input_lengths, const int* cpu_label_lengths, const int* cpu_input_lengths,
const size_t sequence_width, const size_t num_sequences, const size_t sequence_width, const size_t num_sequences,
const size_t blank, float* cpu_loss) { const size_t blank, T* cpu_loss) {
// Init warp-ctc options // Init warp-ctc options
init(ctx, blank); init(ctx, blank);
// Compute the required workspace size. // Compute the required workspace size.
// There is no memory allocated operations within warp-ctc. // There is no memory allocated operations within warp-ctc.
size_t workspace_bytes = 0; size_t workspace_bytes = 0;
ctcStatus_t status = platform::dynload::get_workspace_size( ctcStatus_t status = CTC_STATUS_UNKNOWN_ERROR;
cpu_label_lengths, cpu_input_lengths, static_cast<int>(sequence_width), if (sizeof(T) == 4) {
static_cast<int>(num_sequences), options_, &workspace_bytes); status = platform::dynload::get_workspace_size(
cpu_label_lengths, cpu_input_lengths,
static_cast<int>(sequence_width), static_cast<int>(num_sequences),
options_, &workspace_bytes);
} else {
status = platform::dynload::get_workspace_size_double(
cpu_label_lengths, cpu_input_lengths,
static_cast<int>(sequence_width), static_cast<int>(num_sequences),
options_, &workspace_bytes);
}
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
CTC_STATUS_SUCCESS, status, CTC_STATUS_SUCCESS, status,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
...@@ -79,17 +132,17 @@ class WarpCTCFunctor { ...@@ -79,17 +132,17 @@ class WarpCTCFunctor {
workspace_bytes)); workspace_bytes));
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
size_t workspace_elements = workspace_bytes / sizeof(float) + 1UL; size_t workspace_elements = workspace_bytes / sizeof(T) + 1UL;
Tensor workspace = ctx.AllocateTmpTensor<float, DeviceContext>( Tensor workspace = ctx.AllocateTmpTensor<T, DeviceContext>(
framework::make_ddim({static_cast<int64_t>(workspace_elements)}), framework::make_ddim({static_cast<int64_t>(workspace_elements)}),
dev_ctx); dev_ctx);
float* workspace_data = workspace.data<float>(); T* workspace_data = workspace.data<T>();
math::SetConstant<DeviceContext, float>()( math::SetConstant<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), &workspace, ctx.template device_context<DeviceContext>(), &workspace,
static_cast<float>(0)); static_cast<T>(0));
// compute loss and gradient // compute loss and gradient
status = platform::dynload::compute_ctc_loss( status = ComputeCtcLossFunctor<DeviceContext, T>()(
input, gradient, cpu_labels, cpu_label_lengths, cpu_input_lengths, input, gradient, cpu_labels, cpu_label_lengths, cpu_input_lengths,
static_cast<int>(sequence_width), static_cast<int>(num_sequences), static_cast<int>(sequence_width), static_cast<int>(num_sequences),
cpu_loss, workspace_data, options_); cpu_loss, workspace_data, options_);
...@@ -112,7 +165,8 @@ class WarpCTCFunctor { ...@@ -112,7 +165,8 @@ class WarpCTCFunctor {
ctx.device_context()) ctx.device_context())
.stream(); .stream();
#else #else
PADDLE_THROW("[warpctc init] GPU is not enabled."); PADDLE_THROW(platform::errors::PreconditionNotMet(
"[warpctc init] GPU is not enabled."));
#endif #endif
} else { } else {
options_.loc = CTC_CPU; options_.loc = CTC_CPU;
...@@ -292,7 +346,7 @@ class WarpCTCKernel : public framework::OpKernel<T> { ...@@ -292,7 +346,7 @@ class WarpCTCKernel : public framework::OpKernel<T> {
const size_t blank = static_cast<size_t>(ctx.Attr<int>("blank")); const size_t blank = static_cast<size_t>(ctx.Attr<int>("blank"));
WarpCTCFunctor<DeviceContext>()( WarpCTCFunctor<DeviceContext, T>()(
ctx, warpctc_logits_data, warpctc_grad_data, warpctc_label_data, ctx, warpctc_logits_data, warpctc_grad_data, warpctc_label_data,
warpctc_label_lengths.data(), warpctc_logits_lengths.data(), warpctc_label_lengths.data(), warpctc_logits_lengths.data(),
sequence_width, num_sequences, blank, warpctc_loss_data); sequence_width, num_sequences, blank, warpctc_loss_data);
......
...@@ -53,7 +53,9 @@ extern void* warpctc_dso_handle; ...@@ -53,7 +53,9 @@ extern void* warpctc_dso_handle;
__macro(get_warpctc_version); \ __macro(get_warpctc_version); \
__macro(ctcGetStatusString); \ __macro(ctcGetStatusString); \
__macro(compute_ctc_loss); \ __macro(compute_ctc_loss); \
__macro(get_workspace_size) __macro(compute_ctc_loss_double); \
__macro(get_workspace_size); \
__macro(get_workspace_size_double)
WARPCTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP); WARPCTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP);
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/xpu_info.h"
TEST(InitDevices, CPU) { TEST(InitDevices, CPU) {
using paddle::framework::InitDevices; using paddle::framework::InitDevices;
......
...@@ -15,9 +15,36 @@ ...@@ -15,9 +15,36 @@
#pragma once #pragma once
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
#include <string>
#include <unordered_map>
#include "paddle/fluid/platform/errors.h"
#include "xpu/api.h" #include "xpu/api.h"
#include "xpu/runtime.h" #include "xpu/runtime.h"
#include "xpu/runtime_ex.h" #include "xpu/runtime_ex.h"
namespace xpu = baidu::xpu::api; namespace xpu = baidu::xpu::api;
class XPUActHelper {
public:
// Convert string to activation type in xpu
static xpu::Activation_t ConvertToXpuActType(
const std::string& act_type_str) {
static std::unordered_map<std::string, xpu::Activation_t> str2act = {
{"linear", xpu::Activation_t::LINEAR},
{"relu", xpu::Activation_t::RELU},
{"sigmoid", xpu::Activation_t::SIGMOID},
{"tanh", xpu::Activation_t::TANH},
{"gelu", xpu::Activation_t::GELU},
{"leaky_relu", xpu::Activation_t::LEAKY_RELU},
{"sqrt", xpu::Activation_t::SQRT},
{"square", xpu::Activation_t::SQUARE}};
auto res = str2act.find(act_type_str);
PADDLE_ENFORCE_NE(res, str2act.end(),
paddle::platform::errors::InvalidArgument(
"Invalid activation type(%s) in XPU", act_type_str));
return res->second;
}
};
#endif #endif
...@@ -48,6 +48,7 @@ std::map<std::string, std::set<std::string>> op_ins_map = { ...@@ -48,6 +48,7 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
{"collect_fpn_proposals", {"collect_fpn_proposals",
{"MultiLevelRois", "MultiLevelScores", "MultiLevelRoIsNum"}}, {"MultiLevelRois", "MultiLevelScores", "MultiLevelRoIsNum"}},
{"distribute_fpn_proposals", {"FpnRois", "RoisNum"}}, {"distribute_fpn_proposals", {"FpnRois", "RoisNum"}},
{"warpctc", {"Logits", "Label", "LogitsLength", "LabelLength"}},
}; };
// NOTE(zhiqiu): Like op_ins_map. // NOTE(zhiqiu): Like op_ins_map.
......
...@@ -230,7 +230,6 @@ from .framework import CPUPlace #DEFINE_ALIAS ...@@ -230,7 +230,6 @@ from .framework import CPUPlace #DEFINE_ALIAS
from .framework import CUDAPlace #DEFINE_ALIAS from .framework import CUDAPlace #DEFINE_ALIAS
from .framework import CUDAPinnedPlace #DEFINE_ALIAS from .framework import CUDAPinnedPlace #DEFINE_ALIAS
from .framework import to_variable #DEFINE_ALIAS
from .framework import grad #DEFINE_ALIAS from .framework import grad #DEFINE_ALIAS
from .framework import no_grad #DEFINE_ALIAS from .framework import no_grad #DEFINE_ALIAS
from .framework import save #DEFINE_ALIAS from .framework import save #DEFINE_ALIAS
...@@ -258,6 +257,8 @@ from .tensor.stat import numel #DEFINE_ALIAS ...@@ -258,6 +257,8 @@ from .tensor.stat import numel #DEFINE_ALIAS
from .device import get_cudnn_version from .device import get_cudnn_version
from .device import set_device from .device import set_device
from .device import get_device from .device import get_device
from .device import is_compiled_with_xpu
from .device import XPUPlace
# from .tensor.tensor import Tensor #DEFINE_ALIAS # from .tensor.tensor import Tensor #DEFINE_ALIAS
# from .tensor.tensor import LoDTensor #DEFINE_ALIAS # from .tensor.tensor import LoDTensor #DEFINE_ALIAS
# from .tensor.tensor import LoDTensorArray #DEFINE_ALIAS # from .tensor.tensor import LoDTensorArray #DEFINE_ALIAS
......
...@@ -22,7 +22,9 @@ from paddle.fluid.dygraph.parallel import ParallelEnv ...@@ -22,7 +22,9 @@ from paddle.fluid.dygraph.parallel import ParallelEnv
__all__ = [ __all__ = [
'get_cudnn_version', 'get_cudnn_version',
'set_device', 'set_device',
'get_device' 'get_device',
'XPUPlace',
'is_compiled_with_xpu'
# 'cpu_places', # 'cpu_places',
# 'CPUPlace', # 'CPUPlace',
# 'cuda_pinned_places', # 'cuda_pinned_places',
...@@ -35,6 +37,37 @@ __all__ = [ ...@@ -35,6 +37,37 @@ __all__ = [
_cudnn_version = None _cudnn_version = None
def is_compiled_with_xpu():
"""
Whether paddle was built with WITH_XPU=ON to support Baidu Kunlun
Returns (bool): whether paddle was built with WITH_XPU=ON
Examples:
.. code-block:: python
import paddle
support_xpu = paddle.device.is_compiled_with_xpu()
"""
return core.is_compiled_with_xpu()
def XPUPlace(dev_id):
"""
Return a Baidu Kunlun Place
Parameters:
dev_id(int): Baidu Kunlun device id
Examples:
.. code-block:: python
import paddle
place = paddle.device.XPUPlace(0)
"""
return core.XPUPlace(dev_id)
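The two helpers are meant to be used together: probe the build first, then pick an execution place. A minimal sketch (the device id 0 and the CPU fallback below are only illustrative assumptions about how callers typically guard XPU-only code):

    import paddle

    # Pick an execution place depending on how this Paddle build was compiled.
    if paddle.device.is_compiled_with_xpu():
        place = paddle.device.XPUPlace(0)  # first Baidu Kunlun card (illustrative id)
    else:
        place = paddle.CPUPlace()          # fallback when WITH_XPU=ON was not set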
def get_cudnn_version(): def get_cudnn_version():
""" """
This function returns the version of cudnn. The return value is an int which represents the This function returns the version of cudnn. The return value is an int which represents the
......
...@@ -30,6 +30,7 @@ __all__ = [ ...@@ -30,6 +30,7 @@ __all__ = [
] ]
fleet = Fleet() fleet = Fleet()
_final_strategy = fleet._final_strategy
init = fleet.init init = fleet.init
is_first_worker = fleet.is_first_worker is_first_worker = fleet.is_first_worker
worker_index = fleet.worker_index worker_index = fleet.worker_index
......
...@@ -1244,8 +1244,7 @@ class DistributedStrategy(object): ...@@ -1244,8 +1244,7 @@ class DistributedStrategy(object):
if getattr(self.strategy, f.name): if getattr(self.strategy, f.name):
draws += border + "\n" draws += border + "\n"
draws += h1_format.format( draws += h1_format.format(
"{} = True, please check {}_configs".format( "{}=True <-> {}_configs".format(f.name, f.name))
f.name, f.name))
draws += line + "\n" draws += line + "\n"
my_configs = getattr(self.strategy, my_configs = getattr(self.strategy,
f.name + "_configs") f.name + "_configs")
......
...@@ -119,6 +119,8 @@ class Fleet(object): ...@@ -119,6 +119,8 @@ class Fleet(object):
self.strategy_compiler = None self.strategy_compiler = None
self._is_collective = False self._is_collective = False
self._runtime_handle = None self._runtime_handle = None
self._util = None
self._context = {}
def init(self, role_maker=None, is_collective=False): def init(self, role_maker=None, is_collective=False):
""" """
...@@ -233,7 +235,7 @@ class Fleet(object): ...@@ -233,7 +235,7 @@ class Fleet(object):
Returns: Returns:
int: worker numbers int: worker numbers
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -569,8 +571,9 @@ class Fleet(object): ...@@ -569,8 +571,9 @@ class Fleet(object):
if strategy == None: if strategy == None:
strategy = DistributedStrategy() strategy = DistributedStrategy()
self.user_defined_strategy = strategy
self.valid_strategy = None self._user_defined_strategy = copy.deepcopy(strategy)
self._context = {}
return self return self
@dygraph_only @dygraph_only
...@@ -909,6 +912,15 @@ class Fleet(object): ...@@ -909,6 +912,15 @@ class Fleet(object):
# imitate target optimizer retrieval # imitate target optimizer retrieval
return self.user_defined_optimizer.clear_grad() return self.user_defined_optimizer.clear_grad()
def _final_strategy(self):
if "valid_strategy" not in self._context:
print(
"WARNING: You may need to call minimize function before this function is called"
)
return {}
else:
return self._context["valid_strategy"]
def minimize(self, def minimize(self,
loss, loss,
startup_program=None, startup_program=None,
...@@ -958,12 +970,15 @@ class Fleet(object): ...@@ -958,12 +970,15 @@ class Fleet(object):
# for more examples, please reference https://github.com/PaddlePaddle/FleetX # for more examples, please reference https://github.com/PaddlePaddle/FleetX
""" """
context = {}
context["user_defined_strategy"] = copy.deepcopy(
self._user_defined_strategy)
if paddle.fluid.framework.in_dygraph_mode(): if paddle.fluid.framework.in_dygraph_mode():
# imitate target optimizer retrieval # imitate target optimizer retrieval
target_opt = self.user_defined_optimizer target_opt = self.user_defined_optimizer
self._context = context
return target_opt.minimize(loss) return target_opt.minimize(loss)
context = {}
# cache original feed forward program # cache original feed forward program
self.origin_main_program = loss.block.program self.origin_main_program = loss.block.program
context["origin_main_program"] = self.origin_main_program context["origin_main_program"] = self.origin_main_program
...@@ -984,17 +999,19 @@ class Fleet(object): ...@@ -984,17 +999,19 @@ class Fleet(object):
MetaOptimizerFactory()._get_valid_meta_optimizers( MetaOptimizerFactory()._get_valid_meta_optimizers(
self.user_defined_optimizer) self.user_defined_optimizer)
context["user_defined_strategy"] = copy.copy(self.user_defined_strategy) context["user_defined_strategy"] = copy.deepcopy(
self._user_defined_strategy)
copy_user_defined_strategy = copy.deepcopy(self._user_defined_strategy)
# trigger the auto-parallel in very strict condition # trigger the auto-parallel in very strict condition
# strategy = DistributedStrategy() # strategy = DistributedStrategy()
# strategy.auto = True # strategy.auto = True
# optimizer = paddle.optimizer.SGD(learning_rate=0.1) # optimizer = paddle.optimizer.SGD(learning_rate=0.1)
# optimizer = fleet.distributed_optimizer(optimizer, strategy) # optimizer = fleet.distributed_optimizer(optimizer, strategy)
if self.user_defined_strategy._is_strict_auto(): if copy_user_defined_strategy._is_strict_auto():
# turn on all the strategy for each optimizer # turn on all the strategy for each optimizer
for opt in distributed_optimizer_list: for opt in distributed_optimizer_list:
opt._enable_strategy(self.user_defined_strategy, context) opt._enable_strategy(copy_user_defined_strategy, context)
valid_optimizer_list = [] valid_optimizer_list = []
valid_graph_optimizer_list = [] valid_graph_optimizer_list = []
...@@ -1003,7 +1020,7 @@ class Fleet(object): ...@@ -1003,7 +1020,7 @@ class Fleet(object):
for opt in distributed_optimizer_list: for opt in distributed_optimizer_list:
opt._set_basic_info(loss, self._role_maker, opt._set_basic_info(loss, self._role_maker,
self.user_defined_optimizer, self.user_defined_optimizer,
self.user_defined_strategy) copy_user_defined_strategy)
if opt._can_apply() and not opt._is_graph_out(): if opt._can_apply() and not opt._is_graph_out():
valid_optimizer_list.append(opt) valid_optimizer_list.append(opt)
elif opt._can_apply() and opt._is_graph_out(): elif opt._can_apply() and opt._is_graph_out():
...@@ -1014,13 +1031,15 @@ class Fleet(object): ...@@ -1014,13 +1031,15 @@ class Fleet(object):
meta_optimizer, graph_optimizer = \ meta_optimizer, graph_optimizer = \
self.strategy_compiler.generate_optimizer( self.strategy_compiler.generate_optimizer(
loss, self._role_maker, self.user_defined_optimizer, loss, self._role_maker, self.user_defined_optimizer,
self.user_defined_strategy, valid_optimizer_list, copy_user_defined_strategy, valid_optimizer_list,
valid_graph_optimizer_list) valid_graph_optimizer_list)
valid_strategy = self.strategy_compiler._get_valid_strategy( valid_strategy = self.strategy_compiler._get_valid_strategy(
self.user_defined_strategy, can_not_apply_optimizer_list) copy_user_defined_strategy, can_not_apply_optimizer_list)
context["valid_strategy"] = copy.deepcopy(valid_strategy)
context["valid_strategy"] = valid_strategy self._context = context
self.valid_strategy = valid_strategy self.valid_strategy = valid_strategy
self.valid_strategy._enable_env() self.valid_strategy._enable_env()
......
...@@ -495,7 +495,7 @@ class RoleMakerBase(object): ...@@ -495,7 +495,7 @@ class RoleMakerBase(object):
Returns: Returns:
string: all heter_trainers'endpoints string: all heter_trainers'endpoints
""" """
assert self._heter_trainer_endpoints != [] assert self._heter_trainer_endpoints != [], "Heter Worker Endpoints Not initialized"
return self._heter_trainer_endpoints return self._heter_trainer_endpoints
def _get_heter_worker_endpoint(self): def _get_heter_worker_endpoint(self):
...@@ -505,10 +505,10 @@ class RoleMakerBase(object): ...@@ -505,10 +505,10 @@ class RoleMakerBase(object):
e.g: if we have 4 cpu-trainer(default), 2 gpu-trainer(heter) e.g: if we have 4 cpu-trainer(default), 2 gpu-trainer(heter)
then No.0 and No.2 cpu-trainer will work with No.0 gpu-trainer then No.0 and No.2 cpu-trainer will work with No.0 gpu-trainer
and No.1 and No.3 cpu-trainer will work with No.1 gpu-trainerr and No.1 and No.3 cpu-trainer will work with No.1 gpu-trainer
""" """
assert self._heter_trainer_endpoints != [] assert self._heter_trainer_endpoints != [], "Heter Worker Endpoints Not initialized"
return self._heter_trainer_endpoints[(self._current_id + 1) % return self._heter_trainer_endpoints[(self._current_id) %
self._heter_worker_num()] self._heter_worker_num()]
def _get_heter_worker_device(self): def _get_heter_worker_device(self):
......
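The docstring of `_get_heter_worker_endpoint` describes a plain round-robin pairing; a small stand-alone sketch of that mapping (the endpoints and counts below are made up for illustration):

    # 4 cpu trainers, 2 heter (gpu) workers: trainer i is served by worker i % 2.
    heter_endpoints = ["10.0.0.1:6070", "10.0.0.2:6070"]  # hypothetical endpoints
    for current_id in range(4):
        endpoint = heter_endpoints[current_id % len(heter_endpoints)]
        print("cpu-trainer", current_id, "->", endpoint)
    # cpu-trainer 0 -> 10.0.0.1:6070
    # cpu-trainer 1 -> 10.0.0.2:6070
    # cpu-trainer 2 -> 10.0.0.1:6070
    # cpu-trainer 3 -> 10.0.0.2:6070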
...@@ -23,6 +23,7 @@ from paddle.fluid.executor import Executor ...@@ -23,6 +23,7 @@ from paddle.fluid.executor import Executor
from paddle.fluid.parallel_executor import ParallelExecutor from paddle.fluid.parallel_executor import ParallelExecutor
from .runtime_base import RuntimeBase from .runtime_base import RuntimeBase
from ..base.private_helper_function import wait_server_ready
class ParameterServerRuntime(RuntimeBase): class ParameterServerRuntime(RuntimeBase):
...@@ -94,8 +95,8 @@ class ParameterServerRuntime(RuntimeBase): ...@@ -94,8 +95,8 @@ class ParameterServerRuntime(RuntimeBase):
return False return False
if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
var.desc.type() == core.VarDesc.VarType.READER: var.desc.type() == core.VarDesc.VarType.READER:
return False return False
return var.persistable return var.persistable
...@@ -161,6 +162,17 @@ class ParameterServerRuntime(RuntimeBase): ...@@ -161,6 +162,17 @@ class ParameterServerRuntime(RuntimeBase):
trainer_config = self.async_strategy.get_trainer_runtime_config() trainer_config = self.async_strategy.get_trainer_runtime_config()
dist_strategy = self.context["valid_strategy"]
launch_barrier = dist_strategy.a_sync_configs["launch_barrier"]
if launch_barrier:
# for trainer wait server ready
wait_server_ready(self.role_maker._get_pserver_endpoints())
# for ps-heter mode, wait heter worker ready
if self.role_maker._is_heter_parameter_server_mode and self.role_maker._is_worker(
):
wait_server_ready(self.role_maker._get_heter_worker_endpoints())
lrs = _has_global_step(_get_lr_ops(self.origin_main_program)) lrs = _has_global_step(_get_lr_ops(self.origin_main_program))
if lrs: if lrs:
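The barrier added above is driven by the `launch_barrier` entry of the strategy's `a_sync_configs`; a user-side sketch of toggling it (assuming the key keeps its default of being enabled):

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.a_sync = True
    # Disable the start-up barrier so trainers do not block on wait_server_ready;
    # "launch_barrier" is the config key read by the runtime code above.
    strategy.a_sync_configs = {"launch_barrier": False}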
...@@ -312,7 +324,7 @@ class ParameterServerRuntime(RuntimeBase): ...@@ -312,7 +324,7 @@ class ParameterServerRuntime(RuntimeBase):
opts = _get_optimize_ops(self.origin_main_program) opts = _get_optimize_ops(self.origin_main_program)
for op in opts: for op in opts:
if "Param" in op.input_names and \ if "Param" in op.input_names and \
"LearningRate" in op.input_names and op.input("Param")[0] == param_name: "LearningRate" in op.input_names and op.input("Param")[0] == param_name:
return op return op
def _save_dense_params(self, executor, dirname, context, main_program): def _save_dense_params(self, executor, dirname, context, main_program):
......
...@@ -1291,17 +1291,17 @@ def append_backward(loss, ...@@ -1291,17 +1291,17 @@ def append_backward(loss,
It will be automatically invoked by the optimizer's `minimize` function. It will be automatically invoked by the optimizer's `minimize` function.
Parameters: Parameters:
loss( :ref:`api_guide_Variable_en` ): The loss variable of the network. loss(Tensor): The loss Tensor of the network.
parameter_list(list[Variable|str], optional): List of Parameters or Parameter.names parameter_list(list[Tensor|str], optional): List of Parameters or Parameter.names
that need to be updated by optimizers. that need to be updated by optimizers.
If it is None, all parameters If it is None, all parameters
will be updated. will be updated.
Default: None. Default: None.
no_grad_set(set[Variable|str], optional): Set of Variables or Variable.names in the :ref:`api_guide_Block_en` 0 whose gradients no_grad_set(set[Tensor|str], optional): Set of Tensors or Tensor.names in the :ref:`api_guide_Block_en` 0 whose gradients
should be ignored. All variables with should be ignored. All Tensors with
`stop_gradient=True` from all blocks will `stop_gradient=True` from all blocks will
be automatically added into this set. be automatically added into this set.
If this parameter is not None, the Variables or Variable.names in this set will be added to the default set. If this parameter is not None, the Tensors or Tensor.names in this set will be added to the default set.
Default: None. Default: None.
callbacks(list[callable object], optional): List of callback functions. callbacks(list[callable object], optional): List of callback functions.
The callbacks are used for The callbacks are used for
...@@ -1312,70 +1312,73 @@ def append_backward(loss, ...@@ -1312,70 +1312,73 @@ def append_backward(loss,
new gradient operator is added new gradient operator is added
into the program. The callable into the program. The callable
object must have two input object must have two input
parameters: 'block' and 'context'. parameters: ``block`` and ``context`` .
The 'block' is the :ref:`api_guide_Block_en` which The ``block`` is the :ref:`api_guide_Block_en` which
the new gradient operator will the new gradient operator will
be added to. The 'context' is a be added to. The ``context`` is a
map, whose keys are gradient map, whose keys are gradient
variable names and values are Tensor names and values are
corresponding original :ref:`api_guide_Variable_en` . corresponding original :ref:`api_guide_tensor_en` .
In addition to this, the 'context' In addition to this, the ``context``
has another special key-value pair: has another special key-value pair:
the key is string '__current_op_desc__' the key is string ``__current_op_desc__``
and the value is the op_desc of the and the value is the op_desc of the
gradient operator who has just gradient operator who has just
triggered the callable object. triggered the callable object.
Default: None. Default: None.
Returns: Returns:
list of tuple ( :ref:`api_guide_Variable_en` , :ref:`api_guide_Variable_en` ): Pairs of parameter and its corresponding gradients. list of tuple ( :ref:`api_guide_tensor_en` , :ref:`api_guide_tensor_en` ): Pairs of parameter and its corresponding gradients.
The key is the parameter and the value is gradient variable. The key is the parameter and the value is gradient Tensor.
Raises: Raises:
AssertionError: If `loss` is not an instance of Variable. AssertionError: If ``loss`` is not an instance of Tensor.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.fluid as fluid import paddle
import paddle.nn.functional as F
x = fluid.data(name='x', shape=[None, 13], dtype='int64') paddle.enable_static()
y = fluid.data(name='y', shape=[None, 1], dtype='float32')
x_emb = fluid.embedding(x, size=[100, 256]) x = paddle.static.data(name='x', shape=[None, 13], dtype='int64')
y_predict = fluid.layers.fc(input=x_emb, size=1, act=None, name='my_fc') y = paddle.static.data(name='y', shape=[None, 1], dtype='float32')
loss = fluid.layers.square_error_cost(input=y_predict, label=y) x_emb = paddle.static.nn.embedding(x, size=[100, 256])
avg_loss = fluid.layers.mean(loss) y_predict = paddle.static.nn.fc(input=x_emb, size=1, act=None, name='my_fc')
loss = F.square_error_cost(input=y_predict, label=y)
avg_loss = paddle.mean(loss)
# Get all weights in main_program, not include bias. # Get all weights in main_program, not include bias.
all_weights = [param for param in fluid.default_main_program().block(0).all_parameters() if 'w_' in param.name] all_weights = [param for param in paddle.static.default_main_program().block(0).all_parameters() if 'w_' in param.name]
all_weights_name = [w.name for w in all_weights] all_weights_name = [w.name for w in all_weights]
# return all param_grads needed to be updated if parameter_list set default None. # return all param_grads needed to be updated if parameter_list set default None.
p_g_list1 = fluid.backward.append_backward(loss=avg_loss) p_g_list1 = paddle.static.append_backward(loss=avg_loss)
# output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD), (my_fc.b_0, my_fc.b_0@GRAD)] # output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD), (my_fc.b_0, my_fc.b_0@GRAD)]
# return the param_grads corresponding to parameter_list that can be list of param (Variable). # return the param_grads corresponding to parameter_list that can be list of param (Tensor).
p_g_list2 = fluid.backward.append_backward(loss=avg_loss, parameter_list=all_weights) p_g_list2 = paddle.static.append_backward(loss=avg_loss, parameter_list=all_weights)
# output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)] # output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)]
# parameter_list can be list of param.name (str). # parameter_list can be list of param.name (str).
p_g_list3 = fluid.backward.append_backward(loss=avg_loss, parameter_list=all_weights_name) p_g_list3 = paddle.static.append_backward(loss=avg_loss, parameter_list=all_weights_name)
# output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)] # output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)]
# no_grad_set can be set of Variables that means grad will be cut off from these Variables. # no_grad_set can be set of Tensors that means grad will be cut off from these Tensors.
p_g_list4 = fluid.backward.append_backward(loss=avg_loss, no_grad_set=set([x_emb])) p_g_list4 = paddle.static.append_backward(loss=avg_loss, no_grad_set=set([x_emb]))
# output: [(my_fc.w_0, my_fc.w_0@GRAD), (my_fc.b_0, my_fc.b_0@GRAD)] # output: [(my_fc.w_0, my_fc.w_0@GRAD), (my_fc.b_0, my_fc.b_0@GRAD)]
# no_grad_set can be set of Variable.name when the Variable is created inside layers and can't be specified explicitly. # no_grad_set can be set of Tensor.name when the Tensor is created inside layers and can't be specified explicitly.
p_g_list5 = fluid.backward.append_backward(loss=avg_loss, no_grad_set=set(['my_fc.b_0'])) p_g_list5 = paddle.static.append_backward(loss=avg_loss, no_grad_set=set(['my_fc.b_0']))
# output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)] # output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)]
# return [] because all param_grads are filtered by no_grad_set. # return [] because all param_grads are filtered by no_grad_set.
p_g_list6 = fluid.backward.append_backward(loss=avg_loss, parameter_list=all_weights, no_grad_set=set(all_weights)) p_g_list6 = paddle.static.append_backward(loss=avg_loss, parameter_list=all_weights, no_grad_set=set(all_weights))
""" """
check_type(loss, 'loss', framework.Variable, check_type(loss, 'loss', framework.Variable,
'fluid.backward.append_backward') 'paddle.static.append_backward')
if loss.op is None: if loss.op is None:
# the loss is from a cloned program. Find loss op manually. # the loss is from a cloned program. Find loss op manually.
...@@ -1387,7 +1390,7 @@ def append_backward(loss, ...@@ -1387,7 +1390,7 @@ def append_backward(loss,
if callbacks is not None: if callbacks is not None:
check_type(callbacks, 'callbacks', list, check_type(callbacks, 'callbacks', list,
'fluid.backward.append_backward') 'paddle.static.append_backward')
program = loss.block.program program = loss.block.program
root_block = program.block(0) root_block = program.block(0)
...@@ -1727,21 +1730,21 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): ...@@ -1727,21 +1730,21 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
Backpropagate the gradients of targets to inputs. Backpropagate the gradients of targets to inputs.
Args: Args:
targets(Variable|list[Variable]): The target variables targets(Tensor|list[Tensor]): The target Tensors
inputs(Variable|list[Variable]): The input variables inputs(Tensor|list[Tensor]): The input Tensors
target_gradients (Variable|list[Variable], optional): The gradient variables target_gradients (Tensor|list[Tensor], optional): The gradient Tensors
of targets which has the same shape with targets, If None, ones will of targets which has the same shape with targets, If None, ones will
be created for them. be created for them.
no_grad_set(set[Variable|str], optional): Set of Variables or Variable.names in the :ref:`api_guide_Block_en` 0 whose gradients no_grad_set(set[Tensor|str], optional): Set of Tensors or Tensor.names in the :ref:`api_guide_Block_en` 0 whose gradients
should be ignored. All variables with should be ignored. All Tensors with
`stop_gradient=True` from all blocks will `stop_gradient=True` from all blocks will
be automatically added into this set. be automatically added into this set.
If this parameter is not None, the Variables or Variable.names in this set will be added to the default set. If this parameter is not None, the Tensors or Tensor.names in this set will be added to the default set.
Default: None. Default: None.
Return: Return:
(list[Variable]): A list of gradients for inputs (list[Tensor]): A list of gradients for inputs
If an input does not affect targets, the corresponding gradient variable If an input does not affect targets, the corresponding gradient Tensor
will be None will be None
""" """
targets = _as_list(targets) targets = _as_list(targets)
...@@ -1865,41 +1868,42 @@ def gradients(targets, inputs, target_gradients=None, no_grad_set=None): ...@@ -1865,41 +1868,42 @@ def gradients(targets, inputs, target_gradients=None, no_grad_set=None):
Backpropagate the gradients of targets to inputs. Backpropagate the gradients of targets to inputs.
Args: Args:
targets (Variable|list[Variable]): The target variables. targets (Tensor|list[Tensor]): The target Tensors.
inputs (Variable|list[Variable]): The input variables. inputs (Tensor|list[Tensor]): The input Tensors.
target_gradients (Variable|list[Variable], optional): The gradient variables target_gradients (Tensor|list[Tensor], optional): The gradient Tensor
of targets which has the same shape with targets, If None, ones will of targets which has the same shape with targets, If None, ones will
be created for them. be created for them.
no_grad_set (set[Variable|str], optional): Set of Variables or Variable.names in the :ref:`api_guide_Block_en` 0 whose gradients no_grad_set (set[Tensor|str], optional): Set of Tensors or Tensor.names in the :ref:`api_guide_Block_en` 0 whose gradients
should be ignored. All variables with `stop_gradient=True` from all blocks will should be ignored. All Tensors with ``stop_gradient=True`` from all blocks will
be automatically added into this set. If this parameter is not None, the Variables or Variable.names be automatically added into this set. If this parameter is not None, the Tensors or Tensor.names
in this set will be added to the default set. Default: None. in this set will be added to the default set. Default: None.
Return: Return:
(list[Variable]): A list of gradients for inputs (list[Tensor]): A list of gradients for inputs
If an input does not affect targets, the corresponding gradient variable If an input does not affect targets, the corresponding gradient Tensor
will be None. will be None.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.fluid as fluid import paddle
import paddle.nn.functional as F
paddle.enable_static()
x = fluid.data(name='x', shape=[None,2,8,8], dtype='float32') x = paddle.static.data(name='x', shape=[None, 2, 8, 8], dtype='float32')
x.stop_gradient=False x.stop_gradient=False
y = fluid.layers.conv2d(x, 4, 1, bias_attr=False) y = paddle.static.nn.conv2d(x, 4, 1, bias_attr=False)
y = fluid.layers.relu(y) y = F.relu(y)
y = fluid.layers.conv2d(y, 4, 1, bias_attr=False) z = paddle.static.gradients([y], x)
y = fluid.layers.relu(y) print(z) # [var x@GRAD : fluid.VarType.LOD_TENSOR.shape(-1L, 2L, 8L, 8L).astype(VarType.FP32)]
z = fluid.gradients([y], x)
print(z)
""" """
check_type(targets, 'targets', (framework.Variable, list), check_type(targets, 'targets', (framework.Variable, list),
'fluid.backward.gradients') 'paddle.static.gradients')
check_type(inputs, 'inputs', (framework.Variable, list), check_type(inputs, 'inputs', (framework.Variable, list),
'fluid.backward.gradients') 'paddle.static.gradients')
check_type(target_gradients, 'target_gradients', ( check_type(target_gradients, 'target_gradients', (
framework.Variable, list, type(None)), 'fluid.backward.gradients') framework.Variable, list, type(None)), 'paddle.static.gradients')
outs = calc_gradient(targets, inputs, target_gradients, no_grad_set) outs = calc_gradient(targets, inputs, target_gradients, no_grad_set)
return _as_list(outs) return _as_list(outs)
...@@ -3230,14 +3230,11 @@ class Flatten(layers.Layer): ...@@ -3230,14 +3230,11 @@ class Flatten(layers.Layer):
.. code-block:: python .. code-block:: python
import paddle import paddle
from paddle import to_variable
import numpy as np import numpy as np
paddle.disable_static()
inp_np = np.ones([5, 2, 3, 4]).astype('float32') inp_np = np.ones([5, 2, 3, 4]).astype('float32')
inp_np = paddle.to_tensor(inp_np)
paddle.disable_static()
inp_np = to_variable(inp_np)
flatten = paddle.nn.Flatten(start_axis=1, stop_axis=2) flatten = paddle.nn.Flatten(start_axis=1, stop_axis=2)
flatten_res = flatten(inp_np) flatten_res = flatten(inp_np)
......
...@@ -5396,13 +5396,13 @@ def program_guard(main_program, startup_program=None): ...@@ -5396,13 +5396,13 @@ def program_guard(main_program, startup_program=None):
""" """
:api_attr: Static Graph :api_attr: Static Graph
Change the global main program and startup program with `"with"` statement. Change the global main program and startup program with ``with`` statement.
Layer functions in the Python `"with"` block will append operators and Layer functions in the Python ``with`` block will append operators and
variables to the new main programs. Tensors to the new main programs.
Args: Args:
main_program(Program): New main program inside `"with"` statement. main_program(Program): New main program inside ``with`` statement.
startup_program(Program, optional): New startup program inside `"with"` startup_program(Program, optional): New startup program inside ``with``
statement. :code:`None` means not changing startup program, statement. :code:`None` means not changing startup program,
default_startup_program is still used. default_startup_program is still used.
Default: None. Default: None.
...@@ -5410,13 +5410,14 @@ def program_guard(main_program, startup_program=None): ...@@ -5410,13 +5410,14 @@ def program_guard(main_program, startup_program=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.fluid as fluid import paddle
main_program = fluid.Program() paddle.enable_static()
startup_program = fluid.Program() main_program = paddle.static.Program()
with fluid.program_guard(main_program, startup_program): startup_program = paddle.static.Program()
data = fluid.data(name='image', shape=[None, 784, 784], dtype='float32') with paddle.static.program_guard(main_program, startup_program):
hidden = fluid.layers.fc(input=data, size=10, act='relu') data = paddle.static.data(name='image', shape=[None, 784, 784], dtype='float32')
hidden = paddle.static.nn.fc(input=data, size=10, act='relu')
Notes: The temporary :code:`Program` can be used if the user does not need Notes: The temporary :code:`Program` can be used if the user does not need
to construct either of startup program or main program. to construct either of startup program or main program.
...@@ -5424,20 +5425,22 @@ def program_guard(main_program, startup_program=None): ...@@ -5424,20 +5425,22 @@ def program_guard(main_program, startup_program=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.fluid as fluid import paddle
main_program = fluid.Program() paddle.enable_static()
# does not care about startup program. Just pass a temporary value. main_program = paddle.static.Program()
with fluid.program_guard(main_program, fluid.Program()): # does not care about startup program. Just pass a temporary value.
data = fluid.data(name='image', shape=[None, 784, 784], dtype='float32') with paddle.static.program_guard(main_program, paddle.static.Program()):
data = paddle.static.data(name='image', shape=[None, 784, 784], dtype='float32')
""" """
from .data_feeder import check_type from .data_feeder import check_type
check_type(main_program, 'main_program', Program, 'fluid.program_guard') check_type(main_program, 'main_program', Program,
'paddle.static.program_guard')
main_program = switch_main_program(main_program) main_program = switch_main_program(main_program)
if startup_program is not None: if startup_program is not None:
check_type(startup_program, 'startup_program', Program, check_type(startup_program, 'startup_program', Program,
'fluid.program_guard') 'paddle.static.program_guard')
startup_program = switch_startup_program(startup_program) startup_program = switch_startup_program(startup_program)
try: try:
yield yield
......
...@@ -2488,9 +2488,6 @@ def _error_message(what, arg_name, op_name, right_value, error_value): ...@@ -2488,9 +2488,6 @@ def _error_message(what, arg_name, op_name, right_value, error_value):
def case(pred_fn_pairs, default=None, name=None): def case(pred_fn_pairs, default=None, name=None):
''' '''
:api_attr: Static Graph :api_attr: Static Graph
:alias_main: paddle.nn.case
:alias: paddle.nn.case,paddle.nn.control_flow.case
:old_api: paddle.fluid.layers.case
This operator works like an if-elif-elif-else chain. This operator works like an if-elif-elif-else chain.
...@@ -2500,7 +2497,7 @@ def case(pred_fn_pairs, default=None, name=None): ...@@ -2500,7 +2497,7 @@ def case(pred_fn_pairs, default=None, name=None):
name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
Variable|list(Variable): Tensors returned by the callable from the first pair whose pred is True, Tensor|list(Tensor): Tensors returned by the callable from the first pair whose pred is True,
or Tensors returned by ``default`` if no pred in ``pred_fn_pairs`` is True and ``default`` is not None, or Tensors returned by ``default`` if no pred in ``pred_fn_pairs`` is True and ``default`` is not None,
or Tensors returned by the last callable in ``pred_fn_pairs`` if no pred in ``pred_fn_pairs`` is True and ``default`` is None. or Tensors returned by the last callable in ``pred_fn_pairs`` if no pred in ``pred_fn_pairs`` is True and ``default`` is None.
...@@ -2508,45 +2505,47 @@ def case(pred_fn_pairs, default=None, name=None): ...@@ -2508,45 +2505,47 @@ def case(pred_fn_pairs, default=None, name=None):
TypeError: If the type of ``pred_fn_pairs`` is not list or tuple. TypeError: If the type of ``pred_fn_pairs`` is not list or tuple.
TypeError: If the type of elements in ``pred_fn_pairs`` is not tuple. TypeError: If the type of elements in ``pred_fn_pairs`` is not tuple.
TypeError: If the size of tuples in ``pred_fn_pairs`` is not 2. TypeError: If the size of tuples in ``pred_fn_pairs`` is not 2.
TypeError: If the first element of 2-tuple in ``pred_fn_pairs`` is not Variable. TypeError: If the first element of 2-tuple in ``pred_fn_pairs`` is not a Tensor.
TypeError: If the second element of 2-tuple in ``pred_fn_pairs`` is not callable. TypeError: If the second element of 2-tuple in ``pred_fn_pairs`` is not callable.
TypeError: If ``default`` is not None but it is not callable. TypeError: If ``default`` is not None but it is not callable.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.fluid as fluid import paddle
import paddle.fluid.layers as layers
paddle.enable_static()
def fn_1(): def fn_1():
return layers.fill_constant(shape=[1, 2], dtype='float32', value=1) return paddle.fill_constant(shape=[1, 2], dtype='float32', value=1)
def fn_2(): def fn_2():
return layers.fill_constant(shape=[2, 2], dtype='int32', value=2) return paddle.fill_constant(shape=[2, 2], dtype='int32', value=2)
def fn_3(): def fn_3():
return layers.fill_constant(shape=[3], dtype='int32', value=3) return paddle.fill_constant(shape=[3], dtype='int32', value=3)
main_program = fluid.default_startup_program() main_program = paddle.static.default_main_program()
startup_program = fluid.default_main_program() startup_program = paddle.static.default_startup_program()
with fluid.program_guard(main_program, startup_program):
x = layers.fill_constant(shape=[1], dtype='float32', value=0.3) with paddle.static.program_guard(main_program, startup_program):
y = layers.fill_constant(shape=[1], dtype='float32', value=0.1) x = paddle.fill_constant(shape=[1], dtype='float32', value=0.3)
z = layers.fill_constant(shape=[1], dtype='float32', value=0.2) y = paddle.fill_constant(shape=[1], dtype='float32', value=0.1)
z = paddle.fill_constant(shape=[1], dtype='float32', value=0.2)
pred_1 = layers.less_than(z, x) # true: 0.2 < 0.3 pred_1 = paddle.less_than(z, x) # true: 0.2 < 0.3
pred_2 = layers.less_than(x, y) # false: 0.3 < 0.1 pred_2 = paddle.less_than(x, y) # false: 0.3 < 0.1
pred_3 = layers.equal(x, y) # false: 0.3 == 0.1 pred_3 = paddle.equal(x, y) # false: 0.3 == 0.1
# Call fn_1 because pred_1 is True # Call fn_1 because pred_1 is True
out_1 = layers.case( out_1 = paddle.static.nn.case(
pred_fn_pairs=[(pred_1, fn_1), (pred_2, fn_2)], default=fn_3) pred_fn_pairs=[(pred_1, fn_1), (pred_2, fn_2)], default=fn_3)
# Argument default is None and no pred in pred_fn_pairs is True. fn_3 will be called. # Argument default is None and no pred in pred_fn_pairs is True. fn_3 will be called.
# because fn_3 is the last callable in pred_fn_pairs. # because fn_3 is the last callable in pred_fn_pairs.
out_2 = layers.case(pred_fn_pairs=[(pred_2, fn_2), (pred_3, fn_3)]) out_2 = paddle.static.nn.case(pred_fn_pairs=[(pred_2, fn_2), (pred_3, fn_3)])
exe = fluid.Executor(fluid.CPUPlace()) exe = paddle.static.Executor(paddle.CPUPlace())
res_1, res_2 = exe.run(main_program, fetch_list=[out_1, out_2]) res_1, res_2 = exe.run(main_program, fetch_list=[out_1, out_2])
print(res_1) # [[1. 1.]] print(res_1) # [[1. 1.]]
print(res_2) # [3 3 3] print(res_2) # [3 3 3]
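For readers following this docstring change, a plain-Python sketch of the selection rule described above may help: ``case`` runs the callable of the first pair whose predicate is true, and falls back to ``default`` (or, if ``default`` is None, to the last callable). The helper ``pick_branch`` below is illustrative only and uses ordinary Python booleans instead of the runtime Tensors the real API evaluates; it is not part of the Paddle API.

    def pick_branch(pred_fn_pairs, default=None):
        # Evaluate predicates in order; run the first fn whose pred is true.
        for pred, fn in pred_fn_pairs:
            if bool(pred):
                return fn()
        # No predicate matched: use default if given, else the last callable.
        if default is not None:
            return default()
        return pred_fn_pairs[-1][1]()

    # Plain-Python usage with boolean predicates:
    print(pick_branch([(0.2 < 0.3, lambda: "fn_1"), (0.3 < 0.1, lambda: "fn_2")],
                      default=lambda: "fn_3"))  # prints "fn_1"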
...@@ -3610,18 +3609,18 @@ def switch_case(branch_index, branch_fns, default=None, name=None): ...@@ -3610,18 +3609,18 @@ def switch_case(branch_index, branch_fns, default=None, name=None):
This operator is like a C++ switch/case statement. This operator is like a C++ switch/case statement.
Args: Args:
branch_index(Variable): A Tensor with shape [1] to specify which branch to execute. The data type is ``int32``, ``int64`` or ``uint8``. branch_index(Tensor): A Tensor with shape [1] to specify which branch to execute. The data type is ``int32``, ``int64`` or ``uint8``.
branch_fns(dict|list|tuple): If it's a list or tuple, its elements can be (int, callable) pairs or plain callables, in which case each callable's position in the list is used as its index. If it's a dict, its keys are Python integers and its values are callables. All callables return the same structure of Tensors. branch_fns(dict|list|tuple): If it's a list or tuple, its elements can be (int, callable) pairs or plain callables, in which case each callable's position in the list is used as its index. If it's a dict, its keys are Python integers and its values are callables. All callables return the same structure of Tensors.
default(callable, optional): Callable that returns a structure of Tensors. default(callable, optional): Callable that returns a structure of Tensors.
name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
Variable|list(Variable): Tensors returned by the callable specified by ``branch_index`` in ``branch_fns``, Tensor|list(Tensor): Tensors returned by the callable specified by ``branch_index`` in ``branch_fns``,
or Tensors returned by ``default`` if ``default`` is not None and no index matches in ``branch_fns``, or Tensors returned by ``default`` if ``default`` is not None and no index matches in ``branch_fns``,
or Tensors returned by the callable with the max index in ``branch_fns`` if ``default`` is None and no index matches in ``branch_fns``. or Tensors returned by the callable with the max index in ``branch_fns`` if ``default`` is None and no index matches in ``branch_fns``.
Raises: Raises:
TypeError: If the type of ``branch_index`` is not Variable. TypeError: If the type of ``branch_index`` is not Tensor.
TypeError: If the data type of ``branch_index`` is not ``int32``, ``int64`` or ``uint8``. TypeError: If the data type of ``branch_index`` is not ``int32``, ``int64`` or ``uint8``.
TypeError: If the type of ``branch_fns`` is not dict, list or tuple. TypeError: If the type of ``branch_fns`` is not dict, list or tuple.
TypeError: If the elements of ``branch_fns`` are not 2-tuples. TypeError: If the elements of ``branch_fns`` are not 2-tuples.
...@@ -3633,40 +3632,41 @@ def switch_case(branch_index, branch_fns, default=None, name=None): ...@@ -3633,40 +3632,41 @@ def switch_case(branch_index, branch_fns, default=None, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.fluid as fluid import paddle
import paddle.fluid.layers as layers
paddle.enable_static()
def fn_1(): def fn_1():
return layers.fill_constant(shape=[1, 2], dtype='float32', value=1) return paddle.fill_constant(shape=[1, 2], dtype='float32', value=1)
def fn_2(): def fn_2():
return layers.fill_constant(shape=[2, 2], dtype='int32', value=2) return paddle.fill_constant(shape=[2, 2], dtype='int32', value=2)
def fn_3(): def fn_3():
return layers.fill_constant(shape=[3], dtype='int32', value=3) return paddle.fill_constant(shape=[3], dtype='int32', value=3)
main_program = fluid.default_main_program() main_program = paddle.static.default_main_program()
startup_program = fluid.default_startup_program() startup_program = paddle.static.default_startup_program()
with fluid.program_guard(main_program, startup_program): with paddle.static.program_guard(main_program, startup_program):
index_1 = layers.fill_constant(shape=[1], dtype='int32', value=1) index_1 = paddle.fill_constant(shape=[1], dtype='int32', value=1)
index_2 = layers.fill_constant(shape=[1], dtype='int32', value=2) index_2 = paddle.fill_constant(shape=[1], dtype='int32', value=2)
out_1 = layers.switch_case( out_1 = paddle.static.nn.switch_case(
branch_index=index_1, branch_index=index_1,
branch_fns={1: fn_1, 2: fn_2}, branch_fns={1: fn_1, 2: fn_2},
default=fn_3) default=fn_3)
out_2 = layers.switch_case( out_2 = paddle.static.nn.switch_case(
branch_index=index_2, branch_index=index_2,
branch_fns=[(1, fn_1), (2, fn_2)], branch_fns=[(1, fn_1), (2, fn_2)],
default=fn_3) default=fn_3)
# Argument default is None and no index matches. fn_3 will be called because of the max index 7. # Argument default is None and no index matches. fn_3 will be called because of the max index 7.
out_3 = layers.switch_case( out_3 = paddle.static.nn.switch_case(
branch_index=index_2, branch_index=index_2,
branch_fns=[(0, fn_1), (4, fn_2), (7, fn_3)]) branch_fns=[(0, fn_1), (4, fn_2), (7, fn_3)])
exe = fluid.Executor(fluid.CPUPlace()) exe = paddle.static.Executor(paddle.CPUPlace())
res_1, res_2, res_3 = exe.run(main_program, fetch_list=[out_1, out_2, out_3]) res_1, res_2, res_3 = exe.run(main_program, fetch_list=[out_1, out_2, out_3])
print(res_1) # [[1. 1.]] print(res_1) # [[1. 1.]]
print(res_2) # [[2 2] [2 2]] print(res_2) # [[2 2] [2 2]]
......
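As with ``case`` above, the dispatch rule of ``switch_case`` can be sketched in plain Python: normalize ``branch_fns`` into an index-to-callable mapping, look up ``branch_index``, and fall back to ``default`` or, if ``default`` is None, to the callable with the largest index. The helper ``dispatch`` below is illustrative only, not part of the Paddle API.

    def dispatch(branch_index, branch_fns, default=None):
        # Accept a dict, a list/tuple of (int, fn) pairs, or plain callables.
        if isinstance(branch_fns, dict):
            fns = dict(branch_fns)
        elif branch_fns and callable(branch_fns[0]):
            fns = dict(enumerate(branch_fns))   # position in the list is the index
        else:
            fns = dict(branch_fns)
        if branch_index in fns:
            return fns[branch_index]()
        return default() if default is not None else fns[max(fns)]()

    print(dispatch(2, {1: lambda: "fn_1", 2: lambda: "fn_2"}))                   # fn_2
    print(dispatch(5, [(0, lambda: "a"), (4, lambda: "b"), (7, lambda: "c")]))   # c (max index)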
...@@ -541,7 +541,7 @@ def warpctc(input, ...@@ -541,7 +541,7 @@ def warpctc(input,
(not including the blank label). When it is a 3-D Tensor, its shape (not including the blank label). When it is a 3-D Tensor, its shape
is `[max_logit_length, batch_size, num_classes + 1]`, is `[max_logit_length, batch_size, num_classes + 1]`,
where `max_logit_length` is the longest length of where `max_logit_length` is the longest length of
input logit sequence. The data type must be float32. input logit sequence. The data type should be float32 or float64.
label (Variable): The ground truth of variable-length sequence, label (Variable): The ground truth of variable-length sequence,
which must be a 2-D Tensor with LoD information or a 3-D Tensor without which must be a 2-D Tensor with LoD information or a 3-D Tensor without
LoD information, needs to be consistent with the corresponding input. LoD information, needs to be consistent with the corresponding input.
...@@ -571,6 +571,7 @@ def warpctc(input, ...@@ -571,6 +571,7 @@ def warpctc(input,
.. code-block:: python .. code-block:: python
# using LoDTensor # using LoDTensor
import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import numpy as np import numpy as np
...@@ -581,6 +582,7 @@ def warpctc(input, ...@@ -581,6 +582,7 @@ def warpctc(input,
# class num # class num
class_num = 5 class_num = 5
paddle.enable_static()
logits = fluid.data(name='logits',shape=[None, class_num+1], logits = fluid.data(name='logits',shape=[None, class_num+1],
dtype='float32',lod_level=1) dtype='float32',lod_level=1)
label = fluid.data(name='label', shape=[None, 1], label = fluid.data(name='label', shape=[None, 1],
...@@ -602,6 +604,7 @@ def warpctc(input, ...@@ -602,6 +604,7 @@ def warpctc(input,
.. code-block:: python .. code-block:: python
# using Tensor # using Tensor
import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import numpy as np import numpy as np
...@@ -613,6 +616,7 @@ def warpctc(input, ...@@ -613,6 +616,7 @@ def warpctc(input,
batch_size = 16 batch_size = 16
# class num # class num
class_num = 5 class_num = 5
paddle.enable_static()
logits = fluid.data(name='logits', logits = fluid.data(name='logits',
shape=[max_seq_length, batch_size, class_num+1], shape=[max_seq_length, batch_size, class_num+1],
dtype='float32') dtype='float32')
...@@ -637,8 +641,23 @@ def warpctc(input, ...@@ -637,8 +641,23 @@ def warpctc(input,
fetch_list=[cost.name]) fetch_list=[cost.name])
print(output) print(output)
""" """
if in_dygraph_mode():
if input_length is None or label_length is None:
raise ValueError(
"input_length and label_length must not be None in dygraph mode!"
)
grad, loss_out = core.ops.warpctc(
input,
label,
input_length,
label_length,
'blank',
blank,
'norm_by_times',
norm_by_times, )
return loss_out
helper = LayerHelper('warpctc', **locals()) helper = LayerHelper('warpctc', **locals())
check_variable_and_dtype(input, 'input', ['float32'], "warpctc") check_variable_and_dtype(input, 'input', ['float32', 'float64'], "warpctc")
check_variable_and_dtype(label, 'label', ['int32'], "warpctc") check_variable_and_dtype(label, 'label', ['int32'], "warpctc")
this_inputs = {'Logits': [input], 'Label': [label]} this_inputs = {'Logits': [input], 'Label': [label]}
if input_length is not None and label_length is not None: if input_length is not None and label_length is not None:
......
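Given the new dygraph branch above (which raises a ValueError when ``input_length`` or ``label_length`` is None), a minimal usage sketch through the public ``fluid.layers.warpctc`` wrapper could look as follows; the shapes and values are invented for illustration and assume a build with warp-ctc enabled.

    import numpy as np
    import paddle
    import paddle.fluid as fluid

    paddle.disable_static()
    # Toy shapes: max_logit_length=4, batch_size=2, num_classes=3 (label 0 is the blank).
    logits = paddle.to_tensor(np.random.rand(4, 2, 4).astype('float32'))
    labels = paddle.to_tensor(np.random.randint(1, 4, (2, 2)).astype('int32'))
    logit_lens = paddle.to_tensor(np.array([4, 4], dtype='int64'))
    label_lens = paddle.to_tensor(np.array([2, 2], dtype='int64'))
    # Lengths are mandatory in dygraph mode, per the check added above.
    loss = fluid.layers.warpctc(logits, labels, input_length=logit_lens,
                                label_length=label_lens)
    print(loss.numpy().shape)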
...@@ -394,7 +394,8 @@ foreach(TEST_OP ${TEST_OPS}) ...@@ -394,7 +394,8 @@ foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP}) py_test_modules(${TEST_OP} MODULES ${TEST_OP})
endforeach(TEST_OP) endforeach(TEST_OP)
py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4) py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4)
py_test_modules(test_warpctc_op MODULES test_warpctc_op) # disable test_warpctc_op
# py_test_modules(test_warpctc_op MODULES test_warpctc_op)
py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS ${GC_ENVS}) py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS ${GC_ENVS})
py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS ${GC_ENVS}) py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS ${GC_ENVS})
py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS
...@@ -531,15 +532,15 @@ if(NOT WIN32) ...@@ -531,15 +532,15 @@ if(NOT WIN32)
endif() endif()
if(NOT APPLE AND NOT WIN32) if(NOT APPLE AND NOT WIN32)
bash_test_modules(test_auto_checkpoint START_BASH dist_test.sh TIMEOUT 140) bash_test_modules(test_auto_checkpoint START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
bash_test_modules(test_auto_checkpoint1 START_BASH dist_test.sh TIMEOUT 140) bash_test_modules(test_auto_checkpoint1 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
bash_test_modules(test_auto_checkpoint2 START_BASH dist_test.sh TIMEOUT 140) bash_test_modules(test_auto_checkpoint2 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
bash_test_modules(test_auto_checkpoint3 START_BASH dist_test.sh TIMEOUT 140) bash_test_modules(test_auto_checkpoint3 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
bash_test_modules(test_auto_checkpoint_multiple START_BASH dist_test.sh TIMEOUT 140) bash_test_modules(test_auto_checkpoint_multiple START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
bash_test_modules(test_auto_checkpoint_dist_basic START_BASH dist_test.sh TIMEOUT 140) bash_test_modules(test_auto_checkpoint_dist_basic START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
bash_test_modules(test_hdfs1 START_BASH dist_test.sh TIMEOUT 140) bash_test_modules(test_hdfs1 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
bash_test_modules(test_hdfs2 START_BASH dist_test.sh TIMEOUT 140) bash_test_modules(test_hdfs2 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
bash_test_modules(test_hdfs3 START_BASH dist_test.sh TIMEOUT 140) bash_test_modules(test_hdfs3 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
endif() endif()
add_subdirectory(sequence) add_subdirectory(sequence)
......
...@@ -153,7 +153,7 @@ def gen_fake_line(dnn_data_num=7, ...@@ -153,7 +153,7 @@ def gen_fake_line(dnn_data_num=7,
return line return line
def prepare_fake_data(file_nums=9, file_lines=1000): def prepare_fake_data(file_nums=6, file_lines=1000):
""" """
Create fake data with same type as avazu_ctr_data Create fake data with same type as avazu_ctr_data
""" """
......
...@@ -206,13 +206,6 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase): ...@@ -206,13 +206,6 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase):
debug=int(os.getenv("Debug", "0"))) debug=int(os.getenv("Debug", "0")))
pass_time = time.time() - pass_start pass_time = time.time() - pass_start
print("do_dataset_training done. using time {}".format(pass_time)) print("do_dataset_training done. using time {}".format(pass_time))
if os.getenv("SAVE_MODEL") == "1":
model_dir = tempfile.mkdtemp()
fleet.save_inference_model(exe, model_dir,
[feed.name for feed in self.feeds],
self.avg_cost)
self.check_model_right(model_dir)
shutil.rmtree(model_dir)
fleet.stop_worker() fleet.stop_worker()
print("do_dataset_training stop worker.") print("do_dataset_training stop worker.")
......
...@@ -217,6 +217,9 @@ class OpTest(unittest.TestCase): ...@@ -217,6 +217,9 @@ class OpTest(unittest.TestCase):
return False return False
return True return True
def is_xpu_op_test():
return hasattr(cls, "use_xpu") and cls.use_xpu == True
def is_mkldnn_op_test(): def is_mkldnn_op_test():
return hasattr(cls, "use_mkldnn") and cls.use_mkldnn == True return hasattr(cls, "use_mkldnn") and cls.use_mkldnn == True
...@@ -239,6 +242,7 @@ class OpTest(unittest.TestCase): ...@@ -239,6 +242,7 @@ class OpTest(unittest.TestCase):
if cls.dtype in [np.float32, np.float64] \ if cls.dtype in [np.float32, np.float64] \
and cls.op_type not in op_accuracy_white_list.NO_FP64_CHECK_GRAD_OP_LIST \ and cls.op_type not in op_accuracy_white_list.NO_FP64_CHECK_GRAD_OP_LIST \
and not hasattr(cls, 'exist_fp64_check_grad') \ and not hasattr(cls, 'exist_fp64_check_grad') \
and not is_xpu_op_test() \
and not is_mkldnn_op_test(): and not is_mkldnn_op_test():
raise AssertionError( raise AssertionError(
"This test of %s op needs check_grad with fp64 precision." % "This test of %s op needs check_grad with fp64 precision." %
...@@ -336,6 +340,11 @@ class OpTest(unittest.TestCase): ...@@ -336,6 +340,11 @@ class OpTest(unittest.TestCase):
self.attrs["use_mkldnn"] == True): self.attrs["use_mkldnn"] == True):
self.__class__.use_mkldnn = True self.__class__.use_mkldnn = True
if (hasattr(self, "use_xpu") and self.use_xpu == True) or \
(hasattr(self, "attrs") and "use_xpu" in self.attrs and \
self.attrs["use_xpu"] == True):
self.__class__.use_xpu = True
op_proto = OpProtoHolder.instance().get_op_proto(self.op_type) op_proto = OpProtoHolder.instance().get_op_proto(self.op_type)
"infer datatype from inputs and outputs for this test case" "infer datatype from inputs and outputs for this test case"
self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs) self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs)
...@@ -932,6 +941,8 @@ class OpTest(unittest.TestCase): ...@@ -932,6 +941,8 @@ class OpTest(unittest.TestCase):
need_run_ops = self._get_need_run_ops(op_desc) need_run_ops = self._get_need_run_ops(op_desc)
res = {} res = {}
if hasattr(self, 'attrs') and bool(self.attrs.get('use_xpu', False)):
return
for op_desc, father_op_desc in reversed(need_run_ops): for op_desc, father_op_desc in reversed(need_run_ops):
# The first one is the forward op # The first one is the forward op
has_infer_inplace = fluid.core.has_infer_inplace(op_desc.type()) has_infer_inplace = fluid.core.has_infer_inplace(op_desc.type())
...@@ -1203,6 +1214,11 @@ class OpTest(unittest.TestCase): ...@@ -1203,6 +1214,11 @@ class OpTest(unittest.TestCase):
self.attrs["use_mkldnn"] == True): self.attrs["use_mkldnn"] == True):
self.__class__.use_mkldnn = True self.__class__.use_mkldnn = True
if (hasattr(self, "use_xpu") and self.use_xpu == True) or \
(hasattr(self, "attrs") and "use_xpu" in self.attrs and \
self.attrs["use_xpu"] == True):
self.__class__.use_xpu = True
places = self._get_places() places = self._get_places()
for place in places: for place in places:
res = self.check_output_with_place(place, atol, no_check_set, res = self.check_output_with_place(place, atol, no_check_set,
......
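To see how the new ``use_xpu`` hooks above are meant to be triggered, a hedged sketch of a test that opts in follows; the test class is hypothetical and only demonstrates the flag plumbing (``self.use_xpu`` is recorded on the class and exempts the test from the fp64 grad-check assertion and the inplace check added above).

    import numpy as np
    from op_test import OpTest   # assumes Paddle's unittests directory is on sys.path


    class TestScaleOpXPU(OpTest):   # hypothetical test, for illustration only
        def setUp(self):
            self.op_type = "scale"
            self.use_xpu = True      # picked up by OpTest: sets cls.use_xpu and skips
                                     # the fp64 grad-check requirement shown above
            x = np.random.random((2, 3)).astype('float32')
            self.inputs = {'X': x}
            self.attrs = {'scale': 2.0}
            self.outputs = {'Out': x * 2.0}

        def test_check_output(self):
            self.check_output()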
...@@ -78,15 +78,17 @@ class TestLeakyReluDoubleGradCheck(unittest.TestCase): ...@@ -78,15 +78,17 @@ class TestLeakyReluDoubleGradCheck(unittest.TestCase):
class TestELUDoubleGradCheck(unittest.TestCase): class TestELUDoubleGradCheck(unittest.TestCase):
@prog_scope() @prog_scope()
def func(self, place): def func(self, place):
shape = [2, 3, 7, 9] shape = [2, 3, 6, 6]
eps = 1e-6 eps = 1e-6
alpha = 1.1 alpha = 1.1
dtype = np.float64 dtype = np.float64
SEED = 0
x = layers.data('x', shape, False, dtype) x = layers.data('x', shape, False, dtype)
x.persistable = True x.persistable = True
y = layers.elu(x, alpha=alpha) y = layers.elu(x, alpha=alpha)
np.random.seed(SEED)
x_arr = np.random.uniform(-1, 1, shape).astype(dtype) x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
gradient_checker.double_grad_check( gradient_checker.double_grad_check(
[x], y, x_init=x_arr, place=place, eps=eps) [x], y, x_init=x_arr, place=place, eps=eps)
...@@ -171,5 +173,29 @@ class TestAbsDoubleGradCheck(unittest.TestCase): ...@@ -171,5 +173,29 @@ class TestAbsDoubleGradCheck(unittest.TestCase):
self.func(p) self.func(p)
class TestLogDoubleGradCheck(unittest.TestCase):
@prog_scope()
def func(self, place):
shape = [2, 3, 7, 9]
eps = 1e-6
dtype = np.float64
x = layers.data('x', shape, False, dtype)
x.persistable = True
y = layers.log(x)
x_arr = np.random.uniform(0.1, 1, shape).astype(dtype)
gradient_checker.double_grad_check(
[x], y, x_init=x_arr, place=place, eps=eps)
def test_grad(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for p in places:
self.func(p)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -238,7 +238,7 @@ class TestTanhAPI(unittest.TestCase): ...@@ -238,7 +238,7 @@ class TestTanhAPI(unittest.TestCase):
def test_dygraph_api(self): def test_dygraph_api(self):
paddle.disable_static(self.place) paddle.disable_static(self.place)
x = paddle.to_variable(self.x_np) x = paddle.to_tensor(self.x_np)
out1 = F.tanh(x) out1 = F.tanh(x)
out2 = paddle.tanh(x) out2 = paddle.tanh(x)
th = paddle.nn.Tanh() th = paddle.nn.Tanh()
...@@ -596,7 +596,7 @@ class TestHardShrinkAPI(unittest.TestCase): ...@@ -596,7 +596,7 @@ class TestHardShrinkAPI(unittest.TestCase):
def test_dygraph_api(self): def test_dygraph_api(self):
paddle.disable_static(self.place) paddle.disable_static(self.place)
x = paddle.to_variable(self.x_np) x = paddle.to_tensor(self.x_np)
out1 = F.hardshrink(x) out1 = F.hardshrink(x)
hd = paddle.nn.Hardshrink() hd = paddle.nn.Hardshrink()
out2 = hd(x) out2 = hd(x)
...@@ -666,7 +666,7 @@ class TestHardtanhAPI(unittest.TestCase): ...@@ -666,7 +666,7 @@ class TestHardtanhAPI(unittest.TestCase):
def test_dygraph_api(self): def test_dygraph_api(self):
paddle.disable_static(self.place) paddle.disable_static(self.place)
x = paddle.to_variable(self.x_np) x = paddle.to_tensor(self.x_np)
out1 = F.hardtanh(x) out1 = F.hardtanh(x)
m = paddle.nn.Hardtanh() m = paddle.nn.Hardtanh()
out2 = m(x) out2 = m(x)
...@@ -1112,7 +1112,7 @@ class TestLeakyReluAPI(unittest.TestCase): ...@@ -1112,7 +1112,7 @@ class TestLeakyReluAPI(unittest.TestCase):
def test_dygraph_api(self): def test_dygraph_api(self):
paddle.disable_static(self.place) paddle.disable_static(self.place)
x = paddle.to_variable(self.x_np) x = paddle.to_tensor(self.x_np)
out1 = F.leaky_relu(x) out1 = F.leaky_relu(x)
m = paddle.nn.LeakyReLU() m = paddle.nn.LeakyReLU()
out2 = m(x) out2 = m(x)
......
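The repeated substitutions above and below follow one pattern: ``paddle.to_variable`` is replaced by ``paddle.to_tensor`` for creating imperative tensors from numpy arrays. A minimal before/after sketch:

    import numpy as np
    import paddle

    paddle.disable_static()
    x_np = np.random.rand(2, 3).astype('float32')
    # old spelling (being phased out in these tests):
    #   x = paddle.to_variable(x_np)
    x = paddle.to_tensor(x_np)       # preferred spelling
    print(x.numpy().shape)           # (2, 3)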
...@@ -25,7 +25,7 @@ class TestAdamaxAPI(unittest.TestCase): ...@@ -25,7 +25,7 @@ class TestAdamaxAPI(unittest.TestCase):
def test_adamax_api_dygraph(self): def test_adamax_api_dygraph(self):
paddle.disable_static() paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32") value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_variable(value) a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5) linear = paddle.nn.Linear(13, 5)
adam = paddle.optimizer.Adamax( adam = paddle.optimizer.Adamax(
learning_rate=0.01, learning_rate=0.01,
......
...@@ -22,7 +22,7 @@ class TestAdamWOp(unittest.TestCase): ...@@ -22,7 +22,7 @@ class TestAdamWOp(unittest.TestCase):
def test_adamw_op_dygraph(self): def test_adamw_op_dygraph(self):
paddle.disable_static() paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32") value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_variable(value) a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5) linear = paddle.nn.Linear(13, 5)
adam = paddle.optimizer.AdamW( adam = paddle.optimizer.AdamW(
learning_rate=0.01, learning_rate=0.01,
...@@ -37,7 +37,7 @@ class TestAdamWOp(unittest.TestCase): ...@@ -37,7 +37,7 @@ class TestAdamWOp(unittest.TestCase):
def test_adamw_op_coverage(self): def test_adamw_op_coverage(self):
paddle.disable_static() paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32") value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_variable(value) a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5) linear = paddle.nn.Linear(13, 5)
adam = paddle.optimizer.AdamW( adam = paddle.optimizer.AdamW(
learning_rate=0.0, learning_rate=0.0,
......
...@@ -147,7 +147,7 @@ class TestAdaptiveAvgPool2dAPI(unittest.TestCase): ...@@ -147,7 +147,7 @@ class TestAdaptiveAvgPool2dAPI(unittest.TestCase):
if core.is_compiled_with_cuda() else [False]): if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place) paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np) x = paddle.to_tensor(self.x_np)
out_1 = paddle.nn.functional.adaptive_avg_pool2d( out_1 = paddle.nn.functional.adaptive_avg_pool2d(
x=x, output_size=[3, 3]) x=x, output_size=[3, 3])
...@@ -245,7 +245,7 @@ class TestAdaptiveAvgPool2dClassAPI(unittest.TestCase): ...@@ -245,7 +245,7 @@ class TestAdaptiveAvgPool2dClassAPI(unittest.TestCase):
if core.is_compiled_with_cuda() else [False]): if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place) paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np) x = paddle.to_tensor(self.x_np)
adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=[3, 3]) adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=[3, 3])
out_1 = adaptive_avg_pool(x=x) out_1 = adaptive_avg_pool(x=x)
......
...@@ -162,7 +162,7 @@ class TestAdaptiveAvgPool3dAPI(unittest.TestCase): ...@@ -162,7 +162,7 @@ class TestAdaptiveAvgPool3dAPI(unittest.TestCase):
if core.is_compiled_with_cuda() else [False]): if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place) paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np) x = paddle.to_tensor(self.x_np)
out_1 = paddle.nn.functional.adaptive_avg_pool3d( out_1 = paddle.nn.functional.adaptive_avg_pool3d(
x=x, output_size=[3, 3, 3]) x=x, output_size=[3, 3, 3])
...@@ -262,7 +262,7 @@ class TestAdaptiveAvgPool3dClassAPI(unittest.TestCase): ...@@ -262,7 +262,7 @@ class TestAdaptiveAvgPool3dClassAPI(unittest.TestCase):
if core.is_compiled_with_cuda() else [False]): if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place) paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np) x = paddle.to_tensor(self.x_np)
adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d( adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
output_size=[3, 3, 3]) output_size=[3, 3, 3])
......
...@@ -147,7 +147,7 @@ class TestAdaptiveMaxPool2dAPI(unittest.TestCase): ...@@ -147,7 +147,7 @@ class TestAdaptiveMaxPool2dAPI(unittest.TestCase):
if core.is_compiled_with_cuda() else [False]): if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place) paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np) x = paddle.to_tensor(self.x_np)
out_1 = paddle.nn.functional.adaptive_max_pool2d( out_1 = paddle.nn.functional.adaptive_max_pool2d(
x=x, return_indices=False, output_size=[3, 3]) x=x, return_indices=False, output_size=[3, 3])
...@@ -240,7 +240,7 @@ class TestAdaptiveMaxPool2dClassAPI(unittest.TestCase): ...@@ -240,7 +240,7 @@ class TestAdaptiveMaxPool2dClassAPI(unittest.TestCase):
if core.is_compiled_with_cuda() else [False]): if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place) paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np) x = paddle.to_tensor(self.x_np)
adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=[3, 3]) adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=[3, 3])
out_1 = adaptive_max_pool(x=x) out_1 = adaptive_max_pool(x=x)
......
...@@ -162,7 +162,7 @@ class TestAdaptiveMaxPool3dAPI(unittest.TestCase): ...@@ -162,7 +162,7 @@ class TestAdaptiveMaxPool3dAPI(unittest.TestCase):
if core.is_compiled_with_cuda() else [False]): if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place) paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np) x = paddle.to_tensor(self.x_np)
out_1 = paddle.nn.functional.adaptive_max_pool3d( out_1 = paddle.nn.functional.adaptive_max_pool3d(
x=x, output_size=[3, 3, 3]) x=x, output_size=[3, 3, 3])
...@@ -257,7 +257,7 @@ class TestAdaptiveMaxPool3dClassAPI(unittest.TestCase): ...@@ -257,7 +257,7 @@ class TestAdaptiveMaxPool3dClassAPI(unittest.TestCase):
if core.is_compiled_with_cuda() else [False]): if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place) paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np) x = paddle.to_tensor(self.x_np)
adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d( adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
output_size=[3, 3, 3]) output_size=[3, 3, 3])
......
...@@ -244,9 +244,9 @@ class TestAddMMAPI(unittest.TestCase): ...@@ -244,9 +244,9 @@ class TestAddMMAPI(unittest.TestCase):
def test_error1(): def test_error1():
data_x_wrong = np.ones((2, 3)).astype(np.float32) data_x_wrong = np.ones((2, 3)).astype(np.float32)
x = paddle.to_variable(data_x_wrong) x = paddle.to_tensor(data_x_wrong)
y = paddle.to_variable(data_y) y = paddle.to_tensor(data_y)
input = paddle.to_variable(data_input) input = paddle.to_tensor(data_input)
out = paddle.tensor.addmm( input=input, x=x, y=y, beta=0.5, alpha=5.0 ) out = paddle.tensor.addmm( input=input, x=x, y=y, beta=0.5, alpha=5.0 )
self.assertRaises(ValueError, test_error1) self.assertRaises(ValueError, test_error1)
''' '''
......
...@@ -98,9 +98,9 @@ class TestArangeImperative(unittest.TestCase): ...@@ -98,9 +98,9 @@ class TestArangeImperative(unittest.TestCase):
x2 = paddle.tensor.arange(5) x2 = paddle.tensor.arange(5)
x3 = paddle.tensor.creation.arange(5) x3 = paddle.tensor.creation.arange(5)
start = paddle.to_variable(np.array([0], 'float32')) start = paddle.to_tensor(np.array([0], 'float32'))
end = paddle.to_variable(np.array([5], 'float32')) end = paddle.to_tensor(np.array([5], 'float32'))
step = paddle.to_variable(np.array([1], 'float32')) step = paddle.to_tensor(np.array([1], 'float32'))
x4 = paddle.arange(start, end, step, 'int64') x4 = paddle.arange(start, end, step, 'int64')
paddle.enable_static() paddle.enable_static()
......
...@@ -96,7 +96,7 @@ class TestDygraph(unittest.TestCase): ...@@ -96,7 +96,7 @@ class TestDygraph(unittest.TestCase):
a = np.random.rand(3, 3) a = np.random.rand(3, 3)
a_t = np.transpose(a, [1, 0]) a_t = np.transpose(a, [1, 0])
x_data = np.matmul(a, a_t) + 1e-03 x_data = np.matmul(a, a_t) + 1e-03
x = paddle.to_variable(x_data) x = paddle.to_tensor(x_data)
out = paddle.cholesky(x, upper=False) out = paddle.cholesky(x, upper=False)
......
...@@ -168,9 +168,9 @@ class TestClipAPI(unittest.TestCase): ...@@ -168,9 +168,9 @@ class TestClipAPI(unittest.TestCase):
paddle.disable_static(place) paddle.disable_static(place)
data_shape = [1, 9, 9, 4] data_shape = [1, 9, 9, 4]
data = np.random.random(data_shape).astype('float32') data = np.random.random(data_shape).astype('float32')
images = paddle.to_variable(data, dtype='float32') images = paddle.to_tensor(data, dtype='float32')
v_min = paddle.to_variable(np.array([0.2], dtype=np.float32)) v_min = paddle.to_tensor(np.array([0.2], dtype=np.float32))
v_max = paddle.to_variable(np.array([0.8], dtype=np.float32)) v_max = paddle.to_tensor(np.array([0.8], dtype=np.float32))
out_1 = paddle.clip(images, min=0.2, max=0.8) out_1 = paddle.clip(images, min=0.2, max=0.8)
out_2 = paddle.clip(images, min=0.2, max=0.9) out_2 = paddle.clip(images, min=0.2, max=0.9)
......
...@@ -113,6 +113,7 @@ class TestCommunicatorGeoEnd2End(unittest.TestCase): ...@@ -113,6 +113,7 @@ class TestCommunicatorGeoEnd2End(unittest.TestCase):
strategy = paddle.distributed.fleet.DistributedStrategy() strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True strategy.a_sync = True
strategy.a_sync_configs = {"k_steps": 100} strategy.a_sync_configs = {"k_steps": 100}
strategy.a_sync_configs = {"launch_barrier": False}
if training_role == "TRAINER": if training_role == "TRAINER":
self.run_trainer(role, strategy) self.run_trainer(role, strategy)
......
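The two assignments above set ``k_steps`` and ``launch_barrier`` in separate statements; assuming the ``a_sync_configs`` setter accepts a dict carrying several keys at once (which appears to be the intent of the API, though it is not confirmed by this diff), the same configuration could be written in one statement:

    import paddle

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = True
    # One dict carrying both knobs; assumes the setter accepts multiple keys together.
    strategy.a_sync_configs = {"k_steps": 100, "launch_barrier": False}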
...@@ -51,6 +51,7 @@ class TestCommunicator(unittest.TestCase): ...@@ -51,6 +51,7 @@ class TestCommunicator(unittest.TestCase):
strategy = paddle.distributed.fleet.DistributedStrategy() strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = False strategy.a_sync = False
strategy.a_sync_configs = {"launch_barrier": False}
optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(avg_cost) optimizer.minimize(avg_cost)
......
...@@ -285,9 +285,9 @@ class TestConcatAPI(unittest.TestCase): ...@@ -285,9 +285,9 @@ class TestConcatAPI(unittest.TestCase):
in2 = np.array([[11, 12, 13], [14, 15, 16]]) in2 = np.array([[11, 12, 13], [14, 15, 16]])
in3 = np.array([[21, 22], [23, 24]]) in3 = np.array([[21, 22], [23, 24]])
paddle.disable_static() paddle.disable_static()
x1 = paddle.to_variable(in1) x1 = paddle.to_tensor(in1)
x2 = paddle.to_variable(in2) x2 = paddle.to_tensor(in2)
x3 = paddle.to_variable(in3) x3 = paddle.to_tensor(in3)
out1 = fluid.layers.concat(input=[x1, x2, x3], axis=-1) out1 = fluid.layers.concat(input=[x1, x2, x3], axis=-1)
out2 = paddle.concat(x=[x1, x2], axis=0) out2 = paddle.concat(x=[x1, x2], axis=0)
np_out1 = np.concatenate([in1, in2, in3], axis=-1) np_out1 = np.concatenate([in1, in2, in3], axis=-1)
......
...@@ -75,8 +75,8 @@ class TestCosineSimilarityAPI(unittest.TestCase): ...@@ -75,8 +75,8 @@ class TestCosineSimilarityAPI(unittest.TestCase):
np_x2 = np.random.rand(*shape).astype(np.float32) np_x2 = np.random.rand(*shape).astype(np.float32)
np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps) np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps)
tesnor_x1 = paddle.to_variable(np_x1) tesnor_x1 = paddle.to_tensor(np_x1)
tesnor_x2 = paddle.to_variable(np_x2) tesnor_x2 = paddle.to_tensor(np_x2)
y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps) y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps)
self.assertTrue(np.allclose(y.numpy(), np_out)) self.assertTrue(np.allclose(y.numpy(), np_out))
...@@ -92,8 +92,8 @@ class TestCosineSimilarityAPI(unittest.TestCase): ...@@ -92,8 +92,8 @@ class TestCosineSimilarityAPI(unittest.TestCase):
np_x2 = np.random.rand(*shape).astype(np.float32) np_x2 = np.random.rand(*shape).astype(np.float32)
np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps) np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps)
tesnor_x1 = paddle.to_variable(np_x1) tesnor_x1 = paddle.to_tensor(np_x1)
tesnor_x2 = paddle.to_variable(np_x2) tesnor_x2 = paddle.to_tensor(np_x2)
y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps) y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps)
self.assertTrue(np.allclose(y.numpy(), np_out)) self.assertTrue(np.allclose(y.numpy(), np_out))
...@@ -110,8 +110,8 @@ class TestCosineSimilarityAPI(unittest.TestCase): ...@@ -110,8 +110,8 @@ class TestCosineSimilarityAPI(unittest.TestCase):
np_x2 = np.random.rand(*shape2).astype(np.float32) np_x2 = np.random.rand(*shape2).astype(np.float32)
np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps) np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps)
tesnor_x1 = paddle.to_variable(np_x1) tesnor_x1 = paddle.to_tensor(np_x1)
tesnor_x2 = paddle.to_variable(np_x2) tesnor_x2 = paddle.to_tensor(np_x2)
y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps) y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps)
self.assertTrue(np.allclose(y.numpy(), np_out)) self.assertTrue(np.allclose(y.numpy(), np_out))
...@@ -129,8 +129,8 @@ class TestCosineSimilarityAPI(unittest.TestCase): ...@@ -129,8 +129,8 @@ class TestCosineSimilarityAPI(unittest.TestCase):
np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps) np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps)
cos_sim_func = nn.CosineSimilarity(axis=axis, eps=eps) cos_sim_func = nn.CosineSimilarity(axis=axis, eps=eps)
tesnor_x1 = paddle.to_variable(np_x1) tesnor_x1 = paddle.to_tensor(np_x1)
tesnor_x2 = paddle.to_variable(np_x2) tesnor_x2 = paddle.to_tensor(np_x2)
y = cos_sim_func(tesnor_x1, tesnor_x2) y = cos_sim_func(tesnor_x1, tesnor_x2)
self.assertTrue(np.allclose(y.numpy(), np_out)) self.assertTrue(np.allclose(y.numpy(), np_out))
......
...@@ -21,13 +21,12 @@ import paddle ...@@ -21,13 +21,12 @@ import paddle
import paddle.fluid.core as core import paddle.fluid.core as core
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid import compiler, Program, program_guard from paddle.fluid import compiler, Program, program_guard
from paddle import to_variable
class TestCumsumOp(unittest.TestCase): class TestCumsumOp(unittest.TestCase):
def run_cases(self): def run_cases(self):
data_np = np.arange(12).reshape(3, 4) data_np = np.arange(12).reshape(3, 4)
data = to_variable(data_np) data = paddle.to_tensor(data_np)
y = paddle.cumsum(data) y = paddle.cumsum(data)
z = np.cumsum(data_np) z = np.cumsum(data_np)
......
...@@ -20,7 +20,6 @@ import paddle ...@@ -20,7 +20,6 @@ import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear from paddle.fluid.dygraph import Linear
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle import to_variable
class TestDefaultType(unittest.TestCase): class TestDefaultType(unittest.TestCase):
......
...@@ -36,7 +36,7 @@ class TestDirectory(unittest.TestCase): ...@@ -36,7 +36,7 @@ class TestDirectory(unittest.TestCase):
def test_new_directory(self): def test_new_directory(self):
new_directory = [ new_directory = [
'paddle.enable_static', 'paddle.disable_static', 'paddle.enable_static', 'paddle.disable_static',
'paddle.in_dynamic_mode', 'paddle.to_variable', 'paddle.grad', 'paddle.in_dynamic_mode', 'paddle.to_tensor', 'paddle.grad',
'paddle.no_grad', 'paddle.save', 'paddle.load', 'paddle.no_grad', 'paddle.save', 'paddle.load',
'paddle.static.save', 'paddle.static.load', 'paddle.static.save', 'paddle.static.load',
'paddle.distributed.ParallelEnv', 'paddle.distributed.ParallelEnv',
......
...@@ -52,6 +52,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): ...@@ -52,6 +52,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
strategy = paddle.distributed.fleet.DistributedStrategy() strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True strategy.a_sync = True
strategy.a_sync_configs = {"launch_barrier": False}
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost) optimizer.minimize(avg_cost)
...@@ -92,6 +93,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): ...@@ -92,6 +93,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
strategy = paddle.distributed.fleet.DistributedStrategy() strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True strategy.a_sync = True
strategy.a_sync_configs = {"launch_barrier": False}
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost) optimizer.minimize(avg_cost)
......
...@@ -60,8 +60,8 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): ...@@ -60,8 +60,8 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost) optimizer.minimize(avg_cost)
self.assertTrue(optimizer.user_defined_strategy.a_sync) self.assertTrue(fleet._final_strategy().a_sync)
a_sync_configs = optimizer.user_defined_strategy.a_sync_configs a_sync_configs = fleet._final_strategy().a_sync_configs
self.assertTrue(a_sync_configs['k_steps'] == 0) self.assertTrue(a_sync_configs['k_steps'] == 0)
......
...@@ -72,8 +72,8 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): ...@@ -72,8 +72,8 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost) optimizer.minimize(avg_cost)
self.assertTrue(optimizer.user_defined_strategy.a_sync) self.assertTrue(fleet._final_strategy().a_sync)
a_sync_configs = optimizer.user_defined_strategy.a_sync_configs a_sync_configs = fleet._final_strategy().a_sync_configs
self.assertTrue(a_sync_configs['k_steps'] == 0) self.assertTrue(a_sync_configs['k_steps'] == 0)
......
(12 collapsed file diffs not shown.)