Commit 36cecf61 authored by: S Shang Zhizhou

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_test_activation_op_random_bug
......@@ -18,7 +18,7 @@ SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc)
SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
set(WARPCTC_REPOSITORY https://github.com/baidu-research/warp-ctc.git)
set(WARPCTC_TAG fc7f226b93758216a03b1be9d24593a12819b984)
set(WARPCTC_TAG 95a461eddeabd51099ef059dcfada1117eb1bfb8)
SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
CACHE PATH "Warp-ctc Directory" FORCE)
......@@ -44,8 +44,9 @@ ExternalProject_Add(
"${WARPCTC_DOWNLOAD_CMD}"
PREFIX ${WARPCTC_PREFIX_DIR}
SOURCE_DIR ${WARPCTC_SOURCE_DIR}
UPDATE_COMMAND ""
#UPDATE_COMMAND ""
PATCH_COMMAND ""
BUILD_ALWAYS 1
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
......
......@@ -4,7 +4,7 @@ endif()
INCLUDE(ExternalProject)
SET(XPU_PROJECT "extern_xpu")
SET(XPU_URL "https://kunlun1.su.bcebos.com/xpu.tar.gz" CACHE STRING "" FORCE)
SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu.tar.gz" CACHE STRING "" FORCE)
SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")
......
......@@ -62,9 +62,9 @@ function(op_library TARGET)
endif()
endif()
if(WITH_XPU)
string(REPLACE "_op" "_xpu_op" XPU_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${XPU_FILE}.cc)
list(APPEND xpu_cc_srcs xpu/${XPU_FILE}.cc)
string(REPLACE "_op" "_op_xpu" XPU_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${XPU_FILE}.cc)
list(APPEND xpu_cc_srcs ${XPU_FILE}.cc)
endif()
endif()
else()
......@@ -83,7 +83,7 @@ function(op_library TARGET)
list(APPEND mkldnn_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cu.cc$")
list(APPEND cu_cc_srcs ${src})
elseif(WITH_XPU AND ${src} MATCHES ".*_xpu_op.cc$")
elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$")
list(APPEND xpu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
......
......@@ -76,7 +76,7 @@ void AllReduceOpHandle::AllReduceImpl(
platform::errors::InvalidArgument(
"The NoDummyInputSize should be equal "
"to the number of places, but got NoDummyInputSize is "
"%d and the number of place is %d.",
"%d and the number of places is %d.",
in_var_handles.size(), num_places));
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
......@@ -89,7 +89,7 @@ void AllReduceOpHandle::AllReduceImpl(
platform::errors::InvalidArgument(
"The number of local scopes should be equal "
"to the number of places, but got the number of local scopes is "
"%d and the number of place is %d.",
"%d and the number of places is %d.",
in_var_handles.size(), num_places));
std::vector<const void *> lod_tensor_data;
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -31,10 +32,15 @@ void BroadcastOpHandle::RunImpl() {
auto out_var_handles = DynamicCast<VarHandle>(outputs_);
PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL,
"The number of input should be one.");
PADDLE_ENFORCE_EQ(
out_var_handles.size(), places_.size(),
"The number of output should equal to the number of places.");
platform::errors::PreconditionNotMet(
"The number of inputs should be 1, but got %d.",
in_var_handles.size()));
PADDLE_ENFORCE_EQ(out_var_handles.size(), places_.size(),
platform::errors::PreconditionNotMet(
"The number of outputs and the number of places should "
"be equal, but got the number of outputs is %d and the "
"number of places is %d.",
out_var_handles.size(), places_.size()));
VarHandle *in_var_handle = in_var_handles[0];
......@@ -47,7 +53,9 @@ void BroadcastOpHandle::BroadcastOneVar(
const std::vector<Scope *> &var_scopes) {
auto *in_var =
var_scopes.at(in_var_handle.scope_idx())->FindVar(in_var_handle.name());
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound("Variable %s is not found in scopes.",
in_var_handle.name()));
Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
if (UNLIKELY(!in_tensor.IsInitialized())) {
VLOG(3) << "in var " << in_var_handle.name() << "not inited, return!";
......@@ -103,7 +111,7 @@ void BroadcastOpHandle::BroadcastOneVar(
broadcast_calls.emplace_back(
[send_recv_buffer, numel, type, root_id, &nccl_ctx] {
PADDLE_ENFORCE(platform::dynload::ncclBcast(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
send_recv_buffer, numel, static_cast<ncclDataType_t>(type),
root_id, nccl_ctx.comm_, nccl_ctx.stream()));
});
......@@ -131,7 +139,8 @@ void BroadcastOpHandle::BroadcastOneVar(
nccl_ctxs_->DevCtx(p)->Wait();
}
#else
PADDLE_THROW("CUDA is not enabled.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCLL."));
#endif
}
}
......@@ -154,10 +163,13 @@ void BroadcastOpHandle::InitOutputValue(
auto t_out_p = out_var_handle->place();
auto *out_var = var_scopes.at(out_var_handle->scope_idx())
->FindVar(out_var_handle->name());
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_NOT_NULL(out_var, platform::errors::NotFound(
"Variable %s is not found in scopes.",
out_var_handle->name()));
if (is_gpu_place(in_tensor.place())) {
PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
"Places of input and output must be all on GPU.");
PADDLE_ENFORCE_EQ(platform::is_gpu_place(t_out_p), true,
platform::errors::PreconditionNotMet(
"Places of input and output must be all on GPU."));
} else {
t_out_p = platform::CPUPlace();
}
......
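For reference, a minimal sketch of the error-reporting pattern this commit migrates to: each check pairs a comparison macro (PADDLE_ENFORCE_EQ, PADDLE_ENFORCE_NOT_NULL, ...) with a typed platform::errors::* payload and a formatted message carrying the offending values. The function and argument names below are illustrative only, not taken from the diff.

    #include "paddle/fluid/platform/enforce.h"

    // Illustrative check in the new style: typed error category plus a
    // message that reports both the expected and the actual values.
    void CheckHandleCounts(size_t num_inputs, size_t num_places, const void* var) {
      PADDLE_ENFORCE_EQ(num_inputs, num_places,
                        paddle::platform::errors::InvalidArgument(
                            "The number of inputs should be equal to the "
                            "number of places, but got %zu inputs and %zu "
                            "places.",
                            num_inputs, num_places));
      PADDLE_ENFORCE_NOT_NULL(
          var, paddle::platform::errors::NotFound(
                   "The variable is not found in the scope."));
    }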
......@@ -79,7 +79,8 @@ struct TestBroadcastOpHandle {
}
nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_));
#else
PADDLE_THROW("CUDA is not support.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCLL."));
#endif
} else {
int count = 8;
......@@ -113,7 +114,8 @@ struct TestBroadcastOpHandle {
op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
place_list_, nccl_ctxs_.get());
#else
PADDLE_THROW("CUDA is not support.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCLL."));
#endif
} else {
#if defined(PADDLE_WITH_NCCL)
......@@ -171,7 +173,9 @@ struct TestBroadcastOpHandle {
float val_scalar = 0.0) {
auto var = param_scopes_[input_scope_idx]->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
varname));
auto lod_tensor = var->GetMutable<f::LoDTensor>();
std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
for (size_t k = 0; k < send_vector.size(); ++k) {
......@@ -194,7 +198,9 @@ struct TestBroadcastOpHandle {
}
auto var = param_scopes_[input_scope_idx]->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
varname));
auto selected_rows = var->GetMutable<f::SelectedRows>();
auto value = selected_rows->mutable_value();
value->mutable_data<float>(kDims, place_list_[input_scope_idx]);
......@@ -211,13 +217,24 @@ struct TestBroadcastOpHandle {
const std::vector<float>& send_vector,
const std::vector<int64_t>& rows, int height) {
auto var = param_scopes_[input_scope_idx]->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
varname));
auto& selected_rows = var->Get<f::SelectedRows>();
auto rt = selected_rows.value();
PADDLE_ENFORCE_EQ(selected_rows.height(), height, "height is not equal.");
PADDLE_ENFORCE_EQ(selected_rows.height(), height,
platform::errors::InvalidArgument(
"The height of SelectedRows is not equal to "
"the expected, expect %d, but got %ld.",
height, selected_rows.height()));
for (size_t k = 0; k < selected_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(selected_rows.rows()[k], rows[k]);
PADDLE_ENFORCE_EQ(
selected_rows.rows()[k], rows[k],
platform::errors::InvalidArgument(
"The item at position %zu of rows of SelectedRows "
"is not equal to the expected, expect %ld, but got %ld.",
k, rows[k], selected_rows.rows()[k]));
}
p::CPUPlace cpu_place;
......@@ -235,9 +252,15 @@ struct TestBroadcastOpHandle {
framework::Scope* scope) {
p::CPUPlace cpu_place;
auto var = scope->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
varname));
auto tensor = var->Get<f::LoDTensor>();
PADDLE_ENFORCE_EQ(tensor.lod(), lod, "lod is not equal.");
PADDLE_ENFORCE_EQ(tensor.lod(), lod,
platform::errors::InvalidArgument(
"The LoD of tensor is not equal to "
"the expected, expect %s, but got %s.",
lod, tensor.lod()));
f::Tensor result_tensor;
f::TensorCopySync(tensor, cpu_place, &result_tensor);
float* ct = result_tensor.mutable_data<float>(cpu_place);
......
......@@ -235,7 +235,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
AppendPass("reduce_mode_multi_devices_pass").get();
break;
default:
PADDLE_THROW("Unknown reduce strategy.");
PADDLE_THROW(
platform::errors::Unimplemented("Unknown reduce strategy."));
}
}
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
......
......@@ -12,11 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
#include <memory>
#include <unordered_set>
#include <utility>
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/scope.h"
......@@ -47,15 +48,19 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
platform::CUDADeviceGuard guard(
BOOST_GET_CONST(platform::CUDAPlace, place).device);
PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
PADDLE_ENFORCE_NOT_NULL(event_);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
PADDLE_ENFORCE_NOT_NULL(event_, platform::errors::InvalidArgument(
"The cuda envet created is NULL."));
}
}
#endif
PADDLE_ENFORCE_NE(vars.empty(), true, platform::errors::InvalidArgument(
"Variable names are empty."));
PADDLE_ENFORCE_NE(vars.empty(), true,
platform::errors::InvalidArgument(
"The variables to be deleted are empty."));
for (auto *var : var_infos_) {
PADDLE_ENFORCE_NOT_NULL(var);
PADDLE_ENFORCE_NOT_NULL(var, platform::errors::InvalidArgument(
"The memory optimization info is NULL."));
}
}
......@@ -64,7 +69,7 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() {
if (event_) {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_->GetPlace());
platform::CUDADeviceGuard guard(gpu_place.device);
PADDLE_ENFORCE(cudaEventDestroy(event_));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_));
}
#endif
}
......@@ -78,12 +83,17 @@ void EagerDeletionOpHandle::InitCUDA() {
}
void EagerDeletionOpHandle::CallOnce() {
PADDLE_ENFORCE(vars_.empty(), "vars_ must be initialized here");
PADDLE_ENFORCE_EQ(
vars_.empty(), true,
platform::errors::InvalidArgument(
"The variables to be deleted should be initialized here."));
Scope *exec_scope = local_exec_scopes_[0];
for (auto *var_info : var_infos_) {
auto *var = exec_scope->FindVar(var_info->Name());
PADDLE_ENFORCE_NOT_NULL(var, "Variable %s should not be nullptr",
var_info->Name());
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound(
"The variable(%s) to be inplaced is not found in scope.",
var_info->Name()));
vars_.emplace_back(var);
}
}
......@@ -119,8 +129,9 @@ void EagerDeletionOpHandle::RunImpl() {
garbages.emplace_back(t.MoveMemoryHolder());
}
} else {
PADDLE_THROW("Type %s of %s is not supported eager deletion",
framework::ToTypeName(var->Type()), var_info->Name());
PADDLE_THROW(platform::errors::Unimplemented(
"The variable(%s) of type %s is not supported in eager deletion.",
framework::ToTypeName(var->Type()), var_info->Name()));
}
}
......@@ -137,8 +148,9 @@ void EagerDeletionOpHandle::ClearGarbages(
auto callback_stream =
reinterpret_cast<StreamGarbageCollector *>(gc_)->stream();
auto callback_func = [=]() {
PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, compute_stream));
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(callback_stream, event_, 0));
};
gc_->Add(std::move(*garbages), callback_func);
} else {
......
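The ClearGarbages change above keeps the same stream-ordering trick, only wrapped in PADDLE_ENFORCE_CUDA_SUCCESS: an event is recorded on the compute stream and the garbage-collection stream waits on it, so buffers are not freed while kernels may still be using them. A standalone sketch using only the CUDA runtime API (error checking omitted for brevity):

    #include <cuda_runtime.h>

    // Make callback_stream wait until all work currently queued on
    // compute_stream has finished before it runs the deallocation work.
    void OrderFreeAfterCompute(cudaStream_t compute_stream,
                               cudaStream_t callback_stream) {
      cudaEvent_t event;
      cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
      cudaEventRecord(event, compute_stream);
      cudaStreamWaitEvent(callback_stream, event, 0);
      // ... enqueue the actual frees on callback_stream here ...
      cudaEventDestroy(event);  // safe: destruction is deferred until complete
    }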
......@@ -12,8 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h"
#include <algorithm>
#include <utility>
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
......@@ -56,10 +58,20 @@ void FusedAllReduceOpHandle::RunImpl() {
size_t place_num = places_.size();
PADDLE_ENFORCE_EQ(
in_var_handles.size(), place_num * num_of_all_reduce_,
"The NoDummyInputSize should be equal to the number of places.");
platform::errors::PreconditionNotMet(
"The number of input variable handles should be equal to the number "
"of places plus the number of all reduce handles, "
"but got the number of input variable handles is %d, the "
"number of places is %d, and the number of all reduce handles "
"is %d.",
in_var_handles.size(), place_num, num_of_all_reduce_));
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
platform::errors::PreconditionNotMet(
"The number of input variable handles should be equal to the number "
"of output variable handles, but got the number of input variable "
"handles is %d, and the number of output variable handles is %d.",
in_var_handles.size(), out_var_handles.size()));
// Note: some gradient op doesn't have CUDAKernel, so the gradients of
// those op are in CPUPlace, in this case, the all reduce should not be fused.
......@@ -106,7 +118,13 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc(
dtype = ele_dtype;
}
PADDLE_ENFORCE_EQ(ele_dtype, dtype);
PADDLE_ENFORCE_EQ(
ele_dtype, dtype,
platform::errors::InvalidArgument(
"The DataType of grad tensors of fused_all_reduce_op_handle "
"must be consistent. The current dtype is %s, but the "
"previous dtype is %s.",
DataTypeToString(ele_dtype), DataTypeToString(dtype)));
// Check whether the address space is contiguous.
std::sort(
......@@ -130,16 +148,29 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc(
"input[%d] address: 0X%02x. The offset: %d",
k - 1, g_tensor.at(k - 1).first, cur_address, g_tensor.at(k).first, k,
next_address, k, infer_next_address, offset);
PADDLE_ENFORCE_EQ(infer_next_address, next_address,
"The address is not consistent.");
PADDLE_ENFORCE_EQ(
infer_next_address, next_address,
platform::errors::InvalidArgument(
"The infered address of the next tensor should be equal to the "
"real address of the next tensor. But got infered address is %p "
"and real address is %p.",
infer_next_address, next_address));
}
}
if (!FLAGS_skip_fused_all_reduce_check) {
for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
for (size_t j = 1; j < num_of_all_reduce_; ++j) {
PADDLE_ENFORCE_EQ(grads_tensor.at(0).at(j).first,
grads_tensor.at(scope_idx).at(j).first);
PADDLE_ENFORCE_EQ(
grads_tensor.at(0).at(j).first,
grads_tensor.at(scope_idx).at(j).first,
platform::errors::InvalidArgument(
"The variable name of grad tensors of "
"fused_all_reduce_op_handle "
"must be consistent. The current name is %s, but the "
"previous name is %s.",
grads_tensor.at(0).at(j).first,
grads_tensor.at(scope_idx).at(j).first));
}
}
}
......@@ -167,7 +198,9 @@ bool FusedAllReduceOpHandle::InputIsInDifferentPlace(
for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
auto var_name = in_var_handles[j]->name();
auto var = local_scope->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound(
"The variable '%s' is not found in local scope.", var_name));
auto &lod_tensor = var->Get<LoDTensor>();
if (!is_same_place(lod_tensor.place(), places_.at(scope_idx))) {
return true;
......@@ -185,14 +218,24 @@ void FusedAllReduceOpHandle::GetGradLoDTensor(
size_t place_num = places_.size();
for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
auto var_name = in_var_handles[j]->name();
PADDLE_ENFORCE_EQ(var_name, out_var_handles[j]->name());
PADDLE_ENFORCE_EQ(
var_name, out_var_handles[j]->name(),
platform::errors::InvalidArgument(
"The name of input variable should be equal "
"to the name of output variable. But got the name of input "
"variable is %s and the name of output variable is %s.",
var_name, out_var_handles[j]->name()));
auto var = local_scope->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound(
"The variable '%s' is not found in local scope.", var_name));
auto &lod_tensor = var->Get<LoDTensor>();
PADDLE_ENFORCE_EQ(
platform::is_same_place(lod_tensor.place(), places_.at(scope_idx)),
true, "%s(%d) is not in the right place.", var_name, scope_idx);
true, platform::errors::InvalidArgument(
"The variable '%s' at scope %d is not in the right place.",
var_name, scope_idx));
grad_tensor->emplace_back(std::make_pair(var_name, &lod_tensor));
}
}
......@@ -204,16 +247,26 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
size_t size_of_dtype = 0;
for (size_t i = 0; i < grad_tensor.size(); ++i) {
// Get dtype
auto ele_type = grad_tensor.at(i).second->type();
auto ele_dtype = grad_tensor.at(i).second->type();
if (i == 0) {
*dtype = ele_type;
size_of_dtype = framework::SizeOfType(ele_type);
*dtype = ele_dtype;
size_of_dtype = framework::SizeOfType(ele_dtype);
}
PADDLE_ENFORCE_EQ(ele_type, *dtype);
PADDLE_ENFORCE_EQ(
ele_dtype, *dtype,
platform::errors::InvalidArgument(
"The DataType of grad tensors of fused_all_reduce_op_handle "
"must be consistent. The current dtype is %s, but the "
"previous dtype is %s.",
DataTypeToString(ele_dtype), DataTypeToString(*dtype)));
// Get element number
int64_t len = grad_tensor.at(i).second->numel();
PADDLE_ENFORCE_GT(len, 0);
PADDLE_ENFORCE_GT(
len, 0, platform::errors::InvalidArgument(
"The size of grad tensors of fused_all_reduce_op_handle "
"must be > 0, but got %d.",
len));
*numel +=
platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
}
......
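The contiguity check in FusedAllReduceFunc above expects each gradient tensor to start exactly where the previous one ends after alignment. A standalone sketch of that arithmetic, with a hypothetical AlignUp helper standing in for platform::Alignment:

    #include <cassert>
    #include <cstdint>

    // Hypothetical stand-in for platform::Alignment(): round bytes up to align.
    inline std::size_t AlignUp(std::size_t bytes, std::size_t align) {
      return (bytes + align - 1) / align * align;
    }

    // The inferred start of the next tensor is the start of the current one
    // plus its aligned size; it must equal the real address of the next tensor.
    inline void CheckContiguous(const void* cur, std::size_t cur_bytes,
                                const void* next, std::size_t align) {
      auto infer_next =
          reinterpret_cast<std::uintptr_t>(cur) + AlignUp(cur_bytes, align);
      assert(infer_next == reinterpret_cast<std::uintptr_t>(next));
    }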
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -32,7 +33,15 @@ void FusedBroadcastOpHandle::RunImpl() {
WaitInputVarGenerated();
size_t place_num = places_.size();
PADDLE_ENFORCE_EQ(in_var_handles.size() * place_num, out_var_handles.size());
PADDLE_ENFORCE_EQ(
in_var_handles.size() * place_num, out_var_handles.size(),
platform::errors::PreconditionNotMet(
"The number of input variable handles plus the number "
"of places should be equal to the number of output variable handles, "
"but got the number of input variable handles is %d, the "
"number of places is %d, and the number of output variable handles "
"is %d.",
in_var_handles.size(), place_num, out_var_handles.size()));
for (size_t i = 0; i < in_var_handles.size(); ++i) {
BroadcastOneVar(
......
......@@ -13,8 +13,10 @@
// limitations under the License.
#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
#include <memory>
#include <unordered_map>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/details/broadcast_op_handle_test.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
......@@ -58,7 +60,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
op_handle_ = new FusedBroadcastOpHandle(
nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
#else
PADDLE_THROW("CUDA is not supported.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif
} else {
#if defined(PADDLE_WITH_NCCL)
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/framework/details/gather_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
......@@ -32,13 +33,20 @@ void GatherOpHandle::RunImpl() {
PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(),
"The number of output should equal to the number of places.");
platform::errors::InvalidArgument(
"The number of input variables should be equal "
"to the number of places, but got the number of input variables is "
"%d and the number of places is %d.",
in_var_handles.size(), places_.size()));
VarHandle *out_var_handle;
{
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
"The number of output should be one.");
PADDLE_ENFORCE_EQ(
out_var_handles.size(), 1,
platform::errors::InvalidArgument(
"The number of output variables should be 1, but got %d.",
out_var_handles.size()));
out_var_handle = out_var_handles.front();
}
......@@ -47,10 +55,14 @@ void GatherOpHandle::RunImpl() {
auto in_0_handle = in_var_handles[0];
auto pre_in_var =
var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name());
PADDLE_ENFORCE_NOT_NULL(pre_in_var);
PADDLE_ENFORCE_NOT_NULL(
pre_in_var,
platform::errors::NotFound("The variable '%s' is not found in the scope.",
in_0_handle->name()));
PADDLE_ENFORCE(pre_in_var->IsType<framework::SelectedRows>(),
"Currently, gather_op only can gather SelectedRows.");
PADDLE_ENFORCE_EQ(pre_in_var->IsType<framework::SelectedRows>(), true,
platform::errors::Unimplemented(
"Currently, gather_op only supports SelectedRows."));
// Wait input done, this Wait is asynchronous operation
WaitInputVarGenerated();
......@@ -63,7 +75,10 @@ void GatherOpHandle::RunImpl() {
for (auto *in_handle : in_var_handles) {
auto *in_var =
var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name());
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(
in_var,
platform::errors::NotFound(
"The variable '%s' is not found in the scope.", in_handle->name()));
VariableVisitor::EnforceShapeAndDTypeEQ(*in_var, *pre_in_var);
auto &in_sr_value = in_var->Get<framework::SelectedRows>();
......@@ -76,15 +91,19 @@ void GatherOpHandle::RunImpl() {
// NOTE: The Places of all input tensor must be all on CPU or all on GPU.
platform::Place t_out_p = out_var_handle->place();
if (platform::is_gpu_place(pre_in_value.place())) {
PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
"Places of input and output must be all on GPU.");
PADDLE_ENFORCE_EQ(platform::is_gpu_place(t_out_p), true,
platform::errors::PreconditionNotMet(
"Places of input and output must be all on GPU."));
} else {
t_out_p = platform::CPUPlace();
}
auto out_var = var_scopes.at(out_var_handle->scope_idx())
->FindVar(out_var_handle->name());
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_NOT_NULL(
out_var,
platform::errors::NotFound("The variable '%s' is not found in the scope.",
out_var_handle->name()));
auto out_value = out_var->GetMutable<framework::SelectedRows>();
out_value->set_height(pre_in_value.height());
out_value->set_rows(out_rows);
......
......@@ -13,8 +13,10 @@
// limitations under the License.
#include "paddle/fluid/framework/details/gather_op_handle.h"
#include <memory>
#include <unordered_map>
#include "gtest/gtest.h"
namespace paddle {
......@@ -60,7 +62,8 @@ struct TestGatherOpHandle {
ctxs_.emplace_back(new p::CUDADeviceContext(p));
}
#else
PADDLE_THROW("CUDA is not support.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif
} else {
int count = 8;
......@@ -141,7 +144,9 @@ struct TestGatherOpHandle {
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) {
auto in_var = param_scopes_.at(input_scope_idx)->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound(
"The variable '%s' is not found in the scope.", "input"));
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
auto value = in_selected_rows->mutable_value();
value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
......@@ -155,7 +160,9 @@ struct TestGatherOpHandle {
}
auto out_var = param_scopes_.at(output_scope_idx)->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_NOT_NULL(
out_var, platform::errors::NotFound(
"The variable '%s' is not found in the scope.", "out"));
auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
auto in_var = param_scopes_.at(output_scope_idx)->FindVar("input");
......@@ -173,9 +180,19 @@ struct TestGatherOpHandle {
auto& out_select_rows = out_var->Get<f::SelectedRows>();
auto rt = out_select_rows.value();
PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal.");
PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
platform::errors::InvalidArgument(
"The height of SelectedRows is not equal to "
"the expected, expect %d, but got %d.",
height, out_select_rows.height()));
for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]);
PADDLE_ENFORCE_EQ(
out_select_rows.rows()[k], rows[k % rows.size()],
platform::errors::InvalidArgument(
"The item at position %d of rows of SelectedRows is not equal to "
"the expected, expect %d, but got %d.",
k, rows[k % rows.size()], out_select_rows.rows()[k]));
}
f::Tensor result_tensor;
......@@ -207,6 +224,7 @@ TEST(GatherTester, TestGPUGatherTestSelectedRows) {
test_op.TestGatherSelectedRows(input_scope_idx);
}
#endif
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -46,14 +46,17 @@ class NCCLOpHandleBase : public OpHandleBase {
}
virtual ~NCCLOpHandleBase() {
for (auto& ev : inter_events_) {
PADDLE_ENFORCE(cudaEventDestroy(ev.second));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
}
for (auto& ev : exter_events_) {
PADDLE_ENFORCE(cudaEventDestroy(ev.second));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
}
}
void SetRunEnv(int run_order, bool use_hierarchical_allreduce) {
PADDLE_ENFORCE(run_order >= 0, "run_order must >= 0");
PADDLE_ENFORCE_GE(
run_order, 0,
platform::errors::InvalidArgument(
"The argument run_order must be >= 0, but got %d.", run_order));
run_order_ = run_order;
use_hierarchical_allreduce_ = use_hierarchical_allreduce;
......@@ -74,8 +77,11 @@ class NCCLOpHandleBase : public OpHandleBase {
return;
}
PADDLE_ENFORCE(places_.size() == 1,
"HierarchicalAllReduce run one proc with one card mode.");
PADDLE_ENFORCE_EQ(places_.size(), 1,
platform::errors::InvalidArgument(
"HierarchicalAllReduce can only run "
"one proccess with one card mode, but got %d cards.",
places_.size()));
for (auto& p : places_) {
auto ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order);
......@@ -88,11 +94,11 @@ class NCCLOpHandleBase : public OpHandleBase {
continue;
}
PADDLE_ENFORCE(cudaSetDevice(dev_id));
PADDLE_ENFORCE(cudaEventCreateWithFlags(&inter_events_[dev_id],
cudaEventDisableTiming));
PADDLE_ENFORCE(cudaEventCreateWithFlags(&exter_events_[dev_id],
cudaEventDisableTiming));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
&inter_events_[dev_id], cudaEventDisableTiming));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
&exter_events_[dev_id], cudaEventDisableTiming));
VLOG(10) << "Create events on dev_id:" << dev_id
<< ", inter_event:" << &inter_events_[dev_id]
<< ", exter_event:" << &exter_events_[dev_id];
......@@ -102,7 +108,10 @@ class NCCLOpHandleBase : public OpHandleBase {
void FlatNCCLAllReduce(platform::Place place, const void* sendbuff,
void* recvbuff, size_t count, ncclDataType_t datatype,
ncclRedOp_t op) {
PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0");
PADDLE_ENFORCE_GE(
run_order_, 0,
platform::errors::InvalidArgument(
"The argument run_order_ must be >= 0, but got %d.", run_order_));
auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_);
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
auto& nccl_ctx = flat_nccl_ctxs->at(dev_id);
......@@ -113,14 +122,17 @@ class NCCLOpHandleBase : public OpHandleBase {
<< ", dev_id:" << dev_id << ", dtype:" << datatype
<< ", place:" << place;
PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
sendbuff, recvbuff, count, datatype, op, comm, stream));
}
void NCCLAllReduce(platform::Place place, const void* sendbuff,
void* recvbuff, size_t count, ncclDataType_t datatype,
ncclRedOp_t op) {
PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0");
PADDLE_ENFORCE_GE(
run_order_, 0,
platform::errors::InvalidArgument(
"The argument run_order_ must be >= 0, but got %d.", run_order_));
if (!use_hierarchical_allreduce_) {
FlatNCCLAllReduce(place, sendbuff, recvbuff, count, datatype, op);
return;
......@@ -132,7 +144,10 @@ class NCCLOpHandleBase : public OpHandleBase {
void HierarchicalAllReduce(platform::Place place, const void* sendbuff,
void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op) {
PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0");
PADDLE_ENFORCE_GE(
run_order_, 0,
platform::errors::InvalidArgument(
"The argument run_order_ must be >= 0, but got %d.", run_order_));
InterReduce(place, sendbuff, recvbuff, count, datatype, op);
// When a trainer is not in exter allreduce ring
// they need not to call this.
......@@ -157,14 +172,13 @@ class NCCLOpHandleBase : public OpHandleBase {
<< ", dtype:" << datatype << ", place:" << place
<< ", stream:" << stream;
PADDLE_ENFORCE(platform::dynload::ncclReduce(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream));
cudaEventRecord(inter_events_.at(dev_id), stream);
if (FLAGS_sync_nccl_allreduce) {
PADDLE_ENFORCE(cudaStreamSynchronize(stream),
"sync HierarchicalAllReduce inter stream error");
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
}
}
......@@ -172,7 +186,9 @@ class NCCLOpHandleBase : public OpHandleBase {
void* recvbuff, size_t count, ncclDataType_t datatype,
ncclRedOp_t op) {
auto nccl_ctxs = nccl_ctxs_->GetHierarchicalExterCtx(run_order_);
PADDLE_ENFORCE(nccl_ctxs_, "can't get exter %d nccl_ctxs", run_order_);
PADDLE_ENFORCE_NOT_NULL(
nccl_ctxs_, platform::errors::NotFound(
"Can't get exter %d nccl contexts.", run_order_));
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
auto& nccl_ctx = nccl_ctxs->at(dev_id);
auto stream = nccl_ctx.stream();
......@@ -185,14 +201,13 @@ class NCCLOpHandleBase : public OpHandleBase {
cudaStreamWaitEvent(stream, inter_events_.at(dev_id), 0);
PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
sendbuff, recvbuff, count, datatype, op, comm, stream));
cudaEventRecord(exter_events_.at(dev_id), stream);
if (FLAGS_sync_nccl_allreduce) {
PADDLE_ENFORCE(cudaStreamSynchronize(stream),
"sync HierarchicalAllReduce exter stream error");
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
}
}
......@@ -210,8 +225,8 @@ class NCCLOpHandleBase : public OpHandleBase {
<< ", stream:" << stream;
cudaStreamWaitEvent(stream, exter_events_.at(dev_id), 0);
PADDLE_ENFORCE(platform::dynload::ncclBcast(sendbuff, count, datatype, 0,
comm, stream));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
sendbuff, count, datatype, 0, comm, stream));
}
protected:
......
......@@ -47,8 +47,8 @@ void OpHandleBase::InitCUDA() {
#ifdef PADDLE_WITH_CUDA
for (auto &p : dev_ctxes_) {
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
PADDLE_ENFORCE(cudaSetDevice(dev_id));
PADDLE_ENFORCE(
PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id));
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming));
}
if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) {
......@@ -62,17 +62,22 @@ void OpHandleBase::InitCUDA() {
}
}
} else {
PADDLE_ENFORCE_EQ(dev_ctxes_.size(), 1UL,
"%s should have only one dev_ctx.", Name());
PADDLE_ENFORCE_EQ(
dev_ctxes_.size(), 1UL,
platform::errors::InvalidArgument(
"Operator %s should have only one dev_ctx, but got %d.", Name(),
dev_ctxes_.size()));
auto &place = dev_ctxes_.begin()->first;
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
for (auto &out_var : outputs_) {
auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
if (out_var_handle) {
PADDLE_ENFORCE(platform::is_same_place(place, out_var_handle->place()),
"The place of output(%s) is not consistent with the "
"place of current op(%s).",
out_var_handle->Name(), Name());
PADDLE_ENFORCE_EQ(
platform::is_same_place(place, out_var_handle->place()), true,
platform::errors::InvalidArgument(
"The place of output(%s) is not consistent with the "
"place of current op(%s).",
out_var_handle->Name(), Name()));
out_var_handle->SetGenerateEvent(events_.at(dev_id));
}
}
......@@ -86,7 +91,10 @@ void OpHandleBase::Run(bool use_cuda) {
InitCUDA();
}
#else
PADDLE_ENFORCE(!use_cuda);
PADDLE_ENFORCE_EQ(use_cuda, false,
platform::errors::InvalidArgument(
"Argument use_cuda should be false when Paddle is not "
"compiled with CUDA."));
#endif
// skip running current op, used with inplace_addto_op_pass
......@@ -100,17 +108,20 @@ void OpHandleBase::Run(bool use_cuda) {
void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_NOT_NULL(waited_ctx);
PADDLE_ENFORCE_NOT_NULL(waited_ctx, platform::errors::InvalidArgument(
"Argument waited_ctx is NULL."));
if (platform::is_cpu_place(waited_ctx->GetPlace()) || events_.empty()) {
for (auto &dev_ctx : dev_ctxes_) {
PADDLE_ENFORCE_NOT_NULL(dev_ctx.second);
PADDLE_ENFORCE_NOT_NULL(
dev_ctx.second,
platform::errors::InvalidArgument("The device context is NULL."));
dev_ctx.second->Wait();
}
} else {
auto stream =
static_cast<platform::CUDADeviceContext *>(waited_ctx)->stream();
for (auto &ev : events_) {
PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream, ev.second, 0));
}
}
#else
......@@ -145,10 +156,11 @@ void OpHandleBase::WaitInputVarGenerated() {
auto stream =
static_cast<platform::CUDADeviceContext *>(dev_ctxes_.at(place))
->stream();
PADDLE_ENFORCE(
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
#else
PADDLE_THROW("Doesn't compile the GPU.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif
}
// There are nothing to do when the place is CPUPlace.
......@@ -169,10 +181,11 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
auto stream = static_cast<platform::CUDADeviceContext *>(
dev_ctxes_.at(in_var_handle->place()))
->stream();
PADDLE_ENFORCE(
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
#else
PADDLE_THROW("Doesn't compile the GPU.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif
}
// There are nothing to do when the place is CPUPlace.
......@@ -242,7 +255,9 @@ void OpHandleBase::SetLocalExecScopes(
auto scopes = GetLocalScopes();
for (auto *scope : scopes) {
auto iter = scope_map.find(scope);
PADDLE_ENFORCE(iter != scope_map.end(), "Local scope not found");
PADDLE_ENFORCE_NE(
iter, scope_map.end(),
platform::errors::NotFound("Local scope not found in scope map."));
local_exec_scopes_.emplace_back(iter->second);
}
}
......
......@@ -21,6 +21,7 @@ limitations under the License. */
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/grad_op_desc_maker.h"
#include "paddle/fluid/framework/inplace_op_inference.h"
#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
......@@ -186,19 +187,20 @@ struct OpInfoFiller<T, kOpProtoAndCheckerMaker> {
void operator()(const char* op_type, OpInfo* info) const {
PADDLE_ENFORCE_EQ(info->proto_, nullptr,
platform::errors::AlreadyExists(
"OpProto of %s has been registered", op_type));
"OpProto of %s has been registered.", op_type));
PADDLE_ENFORCE_EQ(info->checker_, nullptr,
platform::errors::AlreadyExists(
"OpAttrChecker of %s has been registered", op_type));
"OpAttrChecker of %s has been registered.", op_type));
info->proto_ = new proto::OpProto;
info->checker_ = new OpAttrChecker();
T maker;
maker(info->proto_, info->checker_);
info->proto_->set_type(op_type);
PADDLE_ENFORCE(
info->proto_->IsInitialized(),
"Fail to initialize %s's OpProto, because %s is not initialized",
op_type, info->proto_->InitializationErrorString());
PADDLE_ENFORCE_EQ(
info->proto_->IsInitialized(), true,
platform::errors::PreconditionNotMet(
"Fail to initialize %s's OpProto, because %s is not initialized.",
op_type, info->proto_->InitializationErrorString()));
}
};
......
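The OpInfoFiller specialization above instantiates a maker type T and lets it fill info->proto_ and info->checker_. A minimal sketch of such a maker, using the usual OpProtoAndCheckerMaker API (the op name and comments are illustrative only, not part of this commit):

    #include "paddle/fluid/framework/op_proto_maker.h"

    // Hypothetical maker: declares the op's inputs, outputs and documentation,
    // which OpInfoFiller<T, kOpProtoAndCheckerMaker> records into OpInfo.
    class MyReluOpMaker : public paddle::framework::OpProtoAndCheckerMaker {
     public:
      void Make() override {
        AddInput("X", "(Tensor) The input tensor.");
        AddOutput("Out", "(Tensor) The output tensor, element-wise max(X, 0).");
        AddComment("Illustrative relu operator used only as an example.");
      }
    };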
......@@ -16,6 +16,7 @@
#include <algorithm>
#include <map>
#include <vector>
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows.h"
......@@ -32,9 +33,13 @@ struct ReduceLoDTensor {
template <typename T>
void apply() const {
PADDLE_ENFORCE(!src_tensors_.empty());
PADDLE_ENFORCE_NE(src_tensors_.empty(), true,
platform::errors::InvalidArgument(
"The number of tensors to be reduced is 0."));
auto &t0 = *src_tensors_[0];
PADDLE_ENFORCE_NE(t0.numel(), 0);
PADDLE_ENFORCE_NE(t0.numel(), 0,
platform::errors::InvalidArgument(
"The size of first tensor to be reduced is 0."));
dst_tensor_.Resize(t0.dims());
T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
......@@ -45,8 +50,19 @@ struct ReduceLoDTensor {
continue;
}
PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
PADDLE_ENFORCE_EQ(t.type(), t0.type());
PADDLE_ENFORCE_EQ(t.dims(), t0.dims(),
platform::errors::InvalidArgument(
"The shape of tensors to be reduced must be "
"consistent. The shape of current tensor is %s, "
"but the shape of the first tensor is %s.",
t.dims(), t0.dims()));
PADDLE_ENFORCE_EQ(t.type(), t0.type(),
platform::errors::InvalidArgument(
"The type of tensors to be reduced must be "
"consistent. The type of current tensor is %s, "
"but the type of the first tensor is %s.",
t.type(), t0.type()));
std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
[](T a, T b) -> T { return a + b; });
}
......@@ -88,7 +104,9 @@ struct GatherLocalSelectedRowsFunctor {
in_places_(in_places),
out_place_(out_place),
dst_selected_rows_(dst_selected_rows) {
PADDLE_ENFORCE_EQ(src_selected_rows.empty(), false);
PADDLE_ENFORCE_NE(src_selected_rows.empty(), true,
platform::errors::InvalidArgument(
"The number of selected_rows to be gathered is 0."));
std::vector<int64_t> out_rows;
......
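ReduceLoDTensor::apply() above accumulates every source tensor into dst with std::transform, i.e. an element-wise dst[i] += t[i]. A standalone sketch of that accumulation on plain vectors:

    #include <algorithm>
    #include <vector>

    int main() {
      std::vector<float> dst{1.f, 2.f, 3.f};
      std::vector<float> src{10.f, 20.f, 30.f};
      // Element-wise accumulation, the same pattern used in ReduceLoDTensor.
      std::transform(src.begin(), src.end(), dst.begin(), dst.begin(),
                     [](float a, float b) { return a + b; });
      // dst is now {11, 22, 33}
      return 0;
    }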
......@@ -13,7 +13,9 @@
// limitations under the License.
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include <memory>
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
......@@ -116,8 +118,15 @@ void ReduceOpHandle::GatherSelectedRows(
merged_dev_ctx->Wait();
scope->EraseVars(std::vector<std::string>{gathered_var_name});
PADDLE_ENFORCE(client->Gather(vars, &remote, *merged_dev_ctx, scope));
PADDLE_ENFORCE(remote.size() == vars.size());
PADDLE_ENFORCE_EQ(
client->Gather(vars, &remote, *merged_dev_ctx, scope), true,
platform::errors::PreconditionNotMet("Gather SelectedRows failed."));
PADDLE_ENFORCE_EQ(remote.size(), vars.size(),
platform::errors::PreconditionNotMet(
"The number of remotes should be equal to the number "
"of variables to be gathered, but got the number of "
"remotes is %d and the number of variables is %d.",
remote.size(), vars.size()));
// 4. merged local selected rows.
std::vector<const SelectedRows *> all;
......@@ -151,14 +160,19 @@ void ReduceOpHandle::RunImpl() {
PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(),
"The number of output should equal to the number of places.");
platform::errors::InvalidArgument(
"The number of inputs should equal to the number of places, but got "
"the number of inputs is %d and the number of places is %d.",
in_var_handles.size(), places_.size()));
VarHandle *out_var_handle;
{
auto out_var_handles = DynamicCast<VarHandle>(outputs_);
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1UL,
"The number of output should be one.");
platform::errors::InvalidArgument(
"The number of output should be one, but got %d.",
out_var_handles.size()));
out_var_handle = out_var_handles.front();
}
......@@ -168,7 +182,10 @@ void ReduceOpHandle::RunImpl() {
auto pre_in_var =
var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name());
PADDLE_ENFORCE_NOT_NULL(pre_in_var);
PADDLE_ENFORCE_NOT_NULL(pre_in_var, platform::errors::NotFound(
"Variable %s is not found in scope.",
in_0_handle->name()));
// NOTE: The Places of all input tensor must be all on CPU or all on GPU.
std::vector<platform::Place> in_places; // used to get dev_ctx
......@@ -176,21 +193,29 @@ void ReduceOpHandle::RunImpl() {
in_places.emplace_back(in_handle->place());
auto in_var =
var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name());
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound("Variable %s is not found in scope.",
in_handle->name()));
VariableVisitor::EnforceShapeAndDTypeEQ(*pre_in_var, *in_var);
}
auto out_var = var_scopes.at(out_var_handle->scope_idx())
->FindVar(out_var_handle->name());
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_NOT_NULL(
out_var, platform::errors::NotFound("Variable %s is not found in scope.",
out_var_handle->name()));
// NOTE: The tensors' Place of input and output must be all on GPU or all on
// CPU.
auto in_p = VariableVisitor::GetMutableTensor(pre_in_var).place();
platform::Place t_out_p;
if (platform::is_gpu_place(in_p)) {
PADDLE_ENFORCE(platform::is_gpu_place(out_var_handle->place()),
"Places of input and output must be all on GPU.");
PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_var_handle->place()), true,
platform::errors::PreconditionNotMet(
"Places of input and output must be all on GPU."));
t_out_p = out_var_handle->place();
} else {
t_out_p = platform::CPUPlace();
......@@ -229,7 +254,10 @@ void ReduceOpHandle::RunImpl() {
in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
out_var->GetMutable<framework::SelectedRows>());
} else {
PADDLE_THROW("only support double or float when gather SelectedRows");
PADDLE_THROW(platform::errors::Unimplemented(
"Only support double or float when gather SelectedRows, but got "
"%s.",
framework::DataTypeToString(in_selected_rows[0]->value().type())));
}
#endif
});
......@@ -292,7 +320,7 @@ void ReduceOpHandle::RunImpl() {
size_t numel = static_cast<size_t>(lod_tensor.numel());
all_reduce_calls.emplace_back(
[buffer, recvbuffer, type, numel, root_id, &nccl_ctx] {
PADDLE_ENFORCE(platform::dynload::ncclReduce(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
buffer, recvbuffer, numel, static_cast<ncclDataType_t>(type),
ncclSum, root_id, nccl_ctx.comm_, nccl_ctx.stream()));
});
......@@ -306,10 +334,13 @@ void ReduceOpHandle::RunImpl() {
}
});
#else
PADDLE_THROW("CUDA is not enabled.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif
} else {
PADDLE_THROW("Place should be CPUPlace or CUDAPlace.");
PADDLE_THROW(platform::errors::InvalidArgument(
"The place of tensor should be CPUPlace or CUDAPlace, but got %s.",
lod_tensors[0]->place()));
}
}
}
......
......@@ -13,7 +13,9 @@
// limitations under the License.
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include <unordered_map>
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"
......@@ -69,7 +71,8 @@ struct TestReduceOpHandle {
}
nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
#else
PADDLE_THROW("CUDA is not support.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCLL."));
#endif
} else {
int count = 8;
......@@ -103,7 +106,8 @@ struct TestReduceOpHandle {
op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
gpu_list_, nccl_ctxs_.get()));
#else
PADDLE_THROW("CUDA is not support.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCLL."));
#endif
} else {
#if defined(PADDLE_WITH_NCCL)
......@@ -164,7 +168,10 @@ struct TestReduceOpHandle {
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) {
auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound(
"Variable %s is not found in scope.", "input"));
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
auto value = in_selected_rows->mutable_value();
value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
......@@ -178,7 +185,9 @@ struct TestReduceOpHandle {
}
auto out_var = param_scopes_[output_scope_idx]->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_NOT_NULL(out_var,
platform::errors::NotFound(
"Variable %s is not found in scope.", "out"));
auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
auto in_var = param_scopes_[output_scope_idx]->FindVar("input");
......@@ -196,9 +205,18 @@ struct TestReduceOpHandle {
auto &out_select_rows = out_var->Get<f::SelectedRows>();
auto rt = out_select_rows.value();
PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal.");
PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
platform::errors::InvalidArgument(
"The height of SelectedRows is not equal to "
"the expected, expect %d, but got %d.",
height, out_select_rows.height()));
for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]);
PADDLE_ENFORCE_EQ(
out_select_rows.rows()[k], rows[k % rows.size()],
platform::errors::InvalidArgument(
"The item at position %d of rows of SelectedRows is not equal to "
"the expected, expect %d, but got %d.",
k, rows[k % rows.size()], out_select_rows.rows()[k]));
}
f::Tensor result_tensor;
......@@ -208,7 +226,7 @@ struct TestReduceOpHandle {
for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) {
ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5);
}
}
} // namespace details
void TestReduceLodTensors(size_t output_scope_idx) {
std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
......@@ -220,7 +238,9 @@ struct TestReduceOpHandle {
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) {
auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound(
"Variable %s is not found in scope.", "input"));
auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
in_lod_tensor->set_lod(lod);
......@@ -230,7 +250,9 @@ struct TestReduceOpHandle {
}
auto out_var = param_scopes_[output_scope_idx]->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_NOT_NULL(out_var,
platform::errors::NotFound(
"Variable %s is not found in scope.", "out"));
auto out_lodtensor = out_var->GetMutable<f::LoDTensor>();
auto in_var = param_scopes_[output_scope_idx]->FindVar("input");
......@@ -254,7 +276,7 @@ struct TestReduceOpHandle {
ASSERT_NEAR(ct[j], send_vector[j] * gpu_list_.size(), 1e-5);
}
}
};
}; // namespace details
TEST(ReduceTester, TestCPUReduceTestSelectedRows) {
TestReduceOpHandle test_op;
......
......@@ -111,13 +111,12 @@ void ShareTensorBufferFunctor::CallOnce() {
auto *out_var = exec_scope_->FindVar(out_var_names_[i]);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound(
"The input variable(%s)to be inplaced should not be NULL.",
"The variable(%s) to be inplaced is not found in scope.",
in_var_infos_[i]->Name()));
PADDLE_ENFORCE_NOT_NULL(
out_var,
platform::errors::NotFound(
"The output variable(%s) to be inplaced should not be NULL.",
out_var_names_[i]));
out_var, platform::errors::NotFound(
"The variable(%s) to be inplaced is not found in scope.",
out_var_names_[i]));
PADDLE_ENFORCE_NE(
in_var, out_var,
platform::errors::PreconditionNotMet(
......
......@@ -12,8 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h"
#include <algorithm>
#include <utility>
#include "dgc/dgc.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
......@@ -38,18 +40,23 @@ SparseAllReduceOpHandle::SparseAllReduceOpHandle(
is_encoded_(is_encoded),
nranks_(nranks) {
// TODO(gongwb) :polish them!
PADDLE_ENFORCE_EQ(is_encoded, true);
PADDLE_ENFORCE_EQ(is_encoded, true, platform::errors::InvalidArgument(
"The argument is_encoded is false."));
VLOG(1) << "Use dgc allreduce mode"
<< ", nranks:" << nranks_;
PADDLE_ENFORCE_GT(local_scopes_.size(), 0);
PADDLE_ENFORCE_GT(local_scopes_.size(), 0,
platform::errors::PreconditionNotMet(
"The number of local scope should be > 0, but got %zu.",
local_scopes_.size()));
auto nranks_name = g_dgc_nranks;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto *local_scope = local_scopes_[i];
auto nranks_var = local_scope->FindVar(nranks_name);
if (nranks_var == nullptr) {
PADDLE_THROW("not find nranks_var:%s", nranks_name);
}
PADDLE_ENFORCE_NOT_NULL(
nranks_var, platform::errors::NotFound(
"Variable %s is not found in scope.", nranks_name));
float *dgc_nranks = nranks_var->GetMutable<LoDTensor>()->data<float>();
*dgc_nranks = nranks;
......@@ -64,10 +71,18 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(),
"The NoDummyInputSize should be equal to the number of places.");
platform::errors::PreconditionNotMet(
"The number of input variables should be equal to the number of "
"places, but got the number of input variables is %zu and the the "
"number of places is %zu.",
in_var_handles.size(), places_.size()));
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
platform::errors::PreconditionNotMet(
"The number of input variables should be equal to the number of "
"output variables, but got the number of input variables is %zu and "
"the the number of output variables is %zu.",
in_var_handles.size(), out_var_handles.size()));
std::vector<const LoDTensor *> ins;
std::vector<LoDTensor *> gathers;
......@@ -80,14 +95,17 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
auto encode_var_name = original_name + g_dgc_encoded;
auto *in_var = local_scope->FindVar(encode_var_name);
PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound("Variable %s is not found in scope.",
encode_var_name));
auto &in = in_var->Get<LoDTensor>();
ins.emplace_back(&in);
auto gather_var_name = original_name + g_dgc_gather;
auto *gather_var = local_scope->FindVar(gather_var_name);
PADDLE_ENFORCE_NOT_NULL(gather_var, "%s should not be null",
gather_var_name);
PADDLE_ENFORCE_NOT_NULL(
gather_var, platform::errors::NotFound(
"Variable %s is not found in scope.", gather_var));
auto *gather = gather_var->GetMutable<LoDTensor>();
gathers.emplace_back(gather);
......@@ -100,14 +118,26 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
}
}
PADDLE_ENFORCE(platform::is_gpu_place(ins[0]->place()));
PADDLE_ENFORCE(platform::is_gpu_place(outs[0]->place()));
PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ins[0]->place()), true,
platform::errors::InvalidArgument(
"The place of input variable should be CUDAPlace, but got %s.",
ins[0]->place()));
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(outs[0]->place()), true,
platform::errors::InvalidArgument(
"The place of input variable should be CUDAPlace, but got %s.",
outs[0]->place()));
PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, platform::errors::PreconditionNotMet(
"The nccl contexts are NULL."));
int dtype = -1;
size_t in_numel = 0;
size_t out_numel = 0;
PADDLE_ENFORCE(nranks_ > 1);
PADDLE_ENFORCE_GT(
nranks_, 1,
platform::errors::PreconditionNotMet(
"The number of ranks should be > 1, but got %d.", nranks_));
std::vector<std::function<void()>> all_gather_calls;
std::vector<std::function<void()>> sparse_reduce_calls;
......@@ -123,8 +153,16 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
dtype = (dtype == -1) ? platform::ToNCCLDataType(in.type()) : dtype;
in_numel = (in_numel == 0) ? static_cast<size_t>(in.numel()) : in_numel;
PADDLE_ENFORCE(in_numel % 2 == 0);
PADDLE_ENFORCE(in_numel / 2 == static_cast<size_t>(k));
PADDLE_ENFORCE_EQ(in_numel % 2, 0,
platform::errors::InvalidArgument(
"The number of elements of input variable should be "
"even, but got %zu.",
in_numel));
PADDLE_ENFORCE_EQ(in_numel / 2, static_cast<size_t>(k),
platform::errors::InvalidArgument(
"The number of elements of input variable should be "
"even, but got %zu.",
in_numel));
out_numel = (out_numel == 0) ? static_cast<size_t>(out.numel()) : out_numel;
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
......@@ -154,7 +192,8 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
PADDLE_ENFORCE_EQ(paddle::communication::dgc::sparseReduce(
gather_buff, k, out_tensor_buf,
static_cast<int>(out_numel), nranks_, stream),
true);
true, platform::errors::Unavailable(
"Calling sparseReduce() failed."));
});
}
......@@ -187,11 +226,16 @@ void SparseAllReduceOpHandle::SparseAllReduceFunc(
int SparseAllReduceOpHandle::GetKValue(const std::string &grad_name) {
auto original_name = paddle::framework::GradOriginalVarName(grad_name);
auto var_name = original_name + g_dgc_k;
PADDLE_ENFORCE(local_scopes_.size() > 0);
PADDLE_ENFORCE_GT(local_scopes_.size(), 0,
platform::errors::PreconditionNotMet(
"The number of local scope should be > 0, but got %zu.",
local_scopes_.size()));
auto *scope = local_exec_scopes_[0];
auto var = scope->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
var_name));
auto tensor = var->Get<LoDTensor>().data<float>();
return *tensor;
}
......@@ -202,15 +246,22 @@ bool SparseAllReduceOpHandle::IsEncoded() {
}
auto counter_name = g_dgc_counter_name;
auto step_name = g_dgc_rampup_begin_step;
PADDLE_ENFORCE(local_scopes_.size() > 0);
PADDLE_ENFORCE_GT(local_scopes_.size(), 0,
platform::errors::PreconditionNotMet(
"The number of local scope should be > 0, but got %zu.",
local_scopes_.size()));
auto *local_scope = local_exec_scopes_[0];
auto count_var = local_scope->FindVar(counter_name);
auto step_var = local_scope->FindVar(step_name);
if (count_var == nullptr || step_var == nullptr) {
PADDLE_THROW("not find count_var:%s or step_var:%s", counter_name,
step_var);
}
PADDLE_ENFORCE_NOT_NULL(
count_var, platform::errors::NotFound(
"Variable %s is not found in scope.", counter_name));
PADDLE_ENFORCE_NOT_NULL(
step_var, platform::errors::NotFound("Variable %s is not found in scope.",
step_name));
float count = *count_var->Get<LoDTensor>().data<float>();
float step = *step_var->Get<LoDTensor>().data<float>();
......
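The two checks on in_numel above encode the assumption that a DGC-encoded gradient stores its top-k entries as k index slots plus k value slots, so the encode buffer must hold exactly 2 * k elements. A small sketch of that size relation (the exact buffer layout is an assumption here; only the counts matter):

    #include <cassert>
    #include <cstddef>

    // The encoded buffer carries k indices and k values: 2 * k elements total.
    void CheckEncodedSize(std::size_t in_numel, std::size_t k) {
      assert(in_numel % 2 == 0);   // indices and values come in pairs
      assert(in_numel / 2 == k);   // exactly k (index, value) pairs
    }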
......@@ -97,6 +97,7 @@ message AsyncConfig {
optional int32 thread_pool_size = 6 [ default = 1 ];
optional int32 send_wait_times = 7 [ default = 1 ];
optional bool runtime_split_send_recv = 8 [ default = false ];
optional bool launch_barrier = 9 [ default = true ];
}
message PipelineConfig { optional int32 micro_batch = 1 [ default = 1 ]; }
......
......@@ -127,11 +127,10 @@ void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_malloc(reinterpret_cast<void **>(&p), size);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], no enough memory", ret));
if (FLAGS_init_allocated_mem) {
PADDLE_THROW(platform::errors::Unimplemented(
"xpu memory FLAGS_init_allocated_mem is not implemented."));
......
......@@ -891,6 +891,28 @@ class SquareDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
}
};
// log Grad: dx = dout / x
// log Grad Grad: ddout = ddx / x; dx = -(dout / x) * (ddx / x)
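//
// Derivation sketch (for y = log(x)):
//   first-order grad:  dx = dout * dy/dx = dout / x
//   differentiating dx = dout / x w.r.t. its inputs, with ddx denoting the
//   incoming gradient of dx:
//     ddout = ddx * d(dx)/d(dout) = ddx / x
//     dx    = ddx * d(dx)/d(x)    = ddx * (-dout / x^2) = -(dout / x) * (ddx / x)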
template <typename T>
class LogDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
public:
using ::paddle::framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("log_grad_grad");
op->SetInput("X", this->Input("X"));
// X@GRAD@GRAD: ddx
op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
op->SetInput("DOut", this->Input(framework::GradVarName("Out")));
op->SetAttrMap(this->Attrs());
// X@GRAD: dx
op->SetOutput("DX", this->InputGrad("X"));
// Out@GRAD@GRAD: ddy
op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out")));
}
};
DECLARE_INPLACE_OP_INFERER(ActivationGradOpInplaceInferer,
{framework::GradVarName("Out"),
framework::GradVarName("X")});
......@@ -1272,6 +1294,35 @@ REGISTER_OP_CPU_KERNEL(
ops::AbsGradGradFunctor<int64_t>>);
/* ========================================================================== */
/* ========================== Log register ==================================*/
REGISTER_OPERATOR(
log, ops::ActivationOp, ops::LogOpMaker, ops::ActivationOpInferVarType,
ops::ActivationGradOpMaker<ops::LogGradFunctor<float>::FwdDeps(),
paddle::framework::OpDesc>,
ops::ActivationGradOpMaker<ops::LogGradFunctor<float>::FwdDeps(),
paddle::imperative::OpBase>,
ops::ActFwdInplaceInferer);
REGISTER_OPERATOR(log_grad, ops::ActivationOpGrad,
ops::ActivationGradOpInplaceInferer,
ops::LogDoubleGradMaker<paddle::framework::OpDesc>,
ops::LogDoubleGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(
log_grad_grad,
ops::ActivationOpDoubleGrad<ops::LogGradGradFunctor<float>::FwdDeps()>,
ops::ActivationDoubleGradOpInplaceInferer);
REGISTER_ACTIVATION_CPU_KERNEL(log, Log, LogFunctor, LogGradFunctor);
REGISTER_OP_CPU_KERNEL(
log_grad_grad, ops::LogDoubleGradKernel<plat::CPUDeviceContext,
ops::LogGradGradFunctor<float>>,
ops::LogDoubleGradKernel<plat::CPUDeviceContext,
ops::LogGradGradFunctor<double>>,
ops::LogDoubleGradKernel<plat::CPUDeviceContext,
ops::LogGradGradFunctor<plat::float16>>);
/* ========================================================================== */
/* ========================== register checkpoint ===========================*/
REGISTER_OP_VERSION(leaky_relu)
.AddCheckpoint(
......
......@@ -193,3 +193,15 @@ REGISTER_OP_CUDA_KERNEL(
ops::ActivationDoubleGradKernel<paddle::platform::CUDADeviceContext,
ops::AbsGradGradFunctor<int64_t>>);
/* ========================================================================== */
/* ========================== Log register ==================================*/
REGISTER_ACTIVATION_CUDA_KERNEL(log, Log, LogFunctor, LogGradFunctor);
REGISTER_OP_CUDA_KERNEL(
log_grad_grad, ops::LogDoubleGradKernel<plat::CUDADeviceContext,
ops::LogGradGradFunctor<float>>,
ops::LogDoubleGradKernel<plat::CUDADeviceContext,
ops::LogGradGradFunctor<double>>,
ops::LogDoubleGradKernel<plat::CUDADeviceContext,
ops::LogGradGradFunctor<plat::float16>>);
/* ========================================================================== */
......@@ -1663,6 +1663,10 @@ class SquareDoubleGradKernel
}
};
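// LogDoubleGradKernel reuses SquareDoubleGradKernel's Compute() unchanged; the
// two double-grad kernels differ only in the Functor template argument supplied
// at registration time (LogGradGradFunctor in the log case).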
template <typename DeviceContext, typename Functor>
class LogDoubleGradKernel
: public SquareDoubleGradKernel<DeviceContext, Functor> {};
template <typename DeviceContext, typename Functor>
class ELUDoubleGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
......@@ -1852,6 +1856,37 @@ class PowGradKernel
functor(*place, x, out, dout, dx);
}
};
template <typename T>
struct LogGradGradFunctor : public BaseActivationFunctor<T> {
template <typename Device>
void operator()(const Device& dev, const framework::Tensor* X,
const framework::Tensor* ddX, framework::Tensor* ddOut,
const framework::Tensor* dOut, framework::Tensor* dX) const {
auto* d = dev.eigen_device();
auto ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "LogGradGrad"));
auto x = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(X, "Input", "X", "LogGradGrad"));
// ddout = ddx / x; dx = -(dout / x) * (ddx / x)
    // compute dx first, so that ddout can safely be computed in place of ddx
if (dX) {
auto dout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOut, "Output", "DOut", "LogGradGrad"));
auto dx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dX, "Output", "DX", "LogGradGrad"));
dx.device(*d) = dout * static_cast<T>(-1) * ddx / (x * x);
}
if (ddOut) {
auto ddout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DDOut", "LogGradGrad"));
ddout.device(*d) = ddx * static_cast<T>(1) / x;
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
};
} // namespace operators
} // namespace paddle
......@@ -1872,7 +1907,6 @@ class PowGradKernel
__macro(cosh, Cosh, CoshFunctor, CoshGradFunctor); \
__macro(round, Round, RoundFunctor, ZeroGradFunctor); \
__macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \
__macro(log, Log, LogFunctor, LogGradFunctor); \
__macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \
__macro(brelu, BRelu, BReluFunctor, BReluGradFunctor); \
__macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/activation_op.h"
#include <string>
#include "paddle/fluid/platform/xpu_header.h"
namespace paddle {
namespace operators {
using paddle::framework::Tensor;
template <typename Functor>
class XPUActivationKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext &context) const override {
Functor functor;
auto attrs = functor.GetAttrs();
for (auto &attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
functor(context);
}
};
template <typename Functor>
class XPUActivationGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext &context) const override {
Functor functor;
auto attrs = functor.GetAttrs();
for (auto &attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
functor(context);
}
};
template <typename DeviceContext, typename T>
void xpu_activation_forward(const framework::ExecutionContext &ctx,
xpu::Activation_t type) {
const auto *x = ctx.Input<Tensor>("X");
auto *y = ctx.Output<Tensor>("Out");
const T *x_data = x->data<T>();
T *y_data = y->mutable_data<T>(ctx.GetPlace());
int r = 0;
if (xpu::Activation_t::ACT_POW == type.type) {
type.pow_factor = ctx.Attr<float>("factor");
}
auto xpu_context = ctx.device_context<DeviceContext>().x_context();
r = xpu::activation_forward(xpu_context, type, x->numel(),
reinterpret_cast<const float *>(x_data),
reinterpret_cast<float *>(y_data));
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
}
template <typename DeviceContext, typename T>
void xpu_activation_backward(const framework::ExecutionContext &ctx,
xpu::Activation_t type) {
/* TODO: relu tanh sigmoid are inplace */
const auto *x = ctx.Input<Tensor>("X");
auto *y = ctx.Input<Tensor>("Out");
auto *dOut = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto *dX = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
const T *x_data = nullptr;
const T *y_data = nullptr;
const T *y_grad = nullptr;
if (x != nullptr) x_data = x->data<T>();
if (y != nullptr) y_data = y->data<T>();
if (dOut != nullptr) y_grad = dOut->data<T>();
T *x_grad = dX->mutable_data<T>(ctx.GetPlace());
auto xpu_context = ctx.device_context<DeviceContext>().x_context();
int r = xpu::activation_backward(xpu_context, type, dX->numel(),
reinterpret_cast<const float *>(x_data),
reinterpret_cast<const float *>(y_data),
reinterpret_cast<const float *>(y_grad),
reinterpret_cast<float *>(x_grad));
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
}
template <typename T, xpu::Activation_t::act_enum algorithm>
struct XPUActivationFunc : public BaseActivationFunctor<T> {
void operator()(const framework::ExecutionContext &ctx) const {
xpu_activation_forward<paddle::platform::XPUDeviceContext, T>(ctx,
algorithm);
}
};
template <typename T, xpu::Activation_t::act_enum algorithm>
struct XPUActivationGradFunc : public BaseActivationFunctor<T> {
void operator()(const framework::ExecutionContext &ctx) const {
xpu_activation_backward<paddle::platform::XPUDeviceContext, T>(ctx,
algorithm);
}
};
template <typename T>
using XPUReluFunctor = XPUActivationFunc<T, xpu::Activation_t::RELU>;
template <typename T>
using XPUSigmoidFunctor = XPUActivationFunc<T, xpu::Activation_t::SIGMOID>;
template <typename T>
using XPUTanhFunctor = XPUActivationFunc<T, xpu::Activation_t::TANH>;
template <typename T>
using XPUGeluFunctor = XPUActivationFunc<T, xpu::Activation_t::GELU>;
template <typename T>
using XPULogFunctor = XPUActivationFunc<T, xpu::Activation_t::LOG>;
template <typename T>
using XPUSquareFunctor = XPUActivationFunc<T, xpu::Activation_t::SQUARE>;
template <typename T>
using XPUSquareGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::SQUARE>;
template <typename T>
using XPUReluGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::RELU>;
template <typename T>
using XPUSigmoidGradFunctor =
XPUActivationGradFunc<T, xpu::Activation_t::SIGMOID>;
template <typename T>
using XPUTanhGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::TANH>;
template <typename T>
using XPUGeluGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::GELU>;
template <typename T>
using XPUSqrtFunctor = XPUActivationFunc<T, xpu::Activation_t::SQRT>;
template <typename T>
using XPUSqrtGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::SQRT>;
template <typename T>
using XPUACTPowFunctor = XPUActivationFunc<T, xpu::Activation_t::ACT_POW>;
template <typename T>
using XPUABSFunctor = XPUActivationFunc<T, xpu::Activation_t::ABS>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
#define REGISTER_ACTIVATION_XPU_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_XPU_KERNEL(act_type, \
ops::XPUActivationKernel<ops::functor<float>>); \
REGISTER_OP_XPU_KERNEL( \
act_type##_grad, \
ops::XPUActivationGradKernel<ops::grad_functor<float>>);
REGISTER_ACTIVATION_XPU_KERNEL(relu, XPUReluFunctor, XPUReluGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(tanh, XPUTanhFunctor, XPUTanhGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, XPUSigmoidFunctor,
XPUSigmoidGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(gelu, XPUGeluFunctor, XPUGeluGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(sqrt, XPUSqrtFunctor, XPUSqrtGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(square, XPUSquareFunctor, XPUSquareGradFunctor)
REGISTER_OP_XPU_KERNEL(log,
ops::XPUActivationKernel<ops::XPULogFunctor<float>>);
REGISTER_OP_XPU_KERNEL(pow,
ops::XPUActivationKernel<ops::XPUACTPowFunctor<float>>);
REGISTER_OP_XPU_KERNEL(abs,
ops::XPUActivationKernel<ops::XPUABSFunctor<float>>);
#endif // PADDLE_WITH_XPU
......@@ -30,8 +30,10 @@ __global__ void ComputeDifferent(T *centers_diff, const T *X, const T *centers,
while (idy < K) {
int64_t id = ids[idy];
PADDLE_ENFORCE(id >= 0, "received id:", id);
PADDLE_ENFORCE(id < N, "received id:", id);
    PADDLE_ENFORCE(id >= 0,
                   "Id should be greater than or equal to 0, but received id: %d.",
                   id);
    PADDLE_ENFORCE(id < N, "Id should be smaller than %d, but received id: %d.",
                   N, id);
T *out = centers_diff + idy * D;
const T *x = X + idy * D;
const T *cent = centers + id * D;
......@@ -52,8 +54,9 @@ __global__ void UpdateCenters(T *centers, T *centers_diff, const int64_t *ids,
while (idy < K) {
int count = 1;
int64_t id = ids[idy];
PADDLE_ENFORCE(id >= 0, "received id:", id);
PADDLE_ENFORCE(id < N, "received id:", id);
    PADDLE_ENFORCE(id >= 0,
                   "Id should be greater than or equal to 0, but received id: %d.",
                   id);
    PADDLE_ENFORCE(id < N, "Id should be smaller than %d, but received id: %d.",
                   N, id);
for (int i = 0; i < K; i++) {
if (ids[i] == id) {
......
......@@ -69,8 +69,10 @@ template <typename T>
class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use CUDAPlace.");
PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::InvalidArgument(
"CTCAlign operator CUDA kernel must use CUDAPlace "
"rather than CPUPlace."));
auto* input = ctx.Input<LoDTensor>("Input");
auto* output = ctx.Output<LoDTensor>("Output");
const int blank = ctx.Attr<int>("blank");
......
......@@ -72,8 +72,11 @@ class CTCAlignKernel : public framework::OpKernel<T> {
// check input dims and lod
PADDLE_ENFORCE_EQ(
input_dims[0], static_cast<int64_t>(input_lod[level].back()),
"The first dimension of Input(Input) should be equal to "
"the sum of all sequences' lengths.");
platform::errors::InvalidArgument(
"The first dimension %d of CTCAlign operator Input(Input) should "
"be equal to "
"the sum of all sequences' lengths %d.",
input_dims[0], static_cast<int64_t>(input_lod[level].back())));
const size_t num_sequences = input_lod[level].size() - 1;
......
......@@ -175,10 +175,6 @@ void RecvGeoSparseRecords(const CommContext &rpc_ctx,
template <typename T>
void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto cpu_place = platform::CPUPlace();
auto &cpu_ctx = *pool.Get(cpu_place);
distributed::RPCClient *rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>(rpc_ctx.trainer_id);
......@@ -188,8 +184,13 @@ void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) {
if (rpc_ctx.origin_varnames.size() == 1 &&
rpc_ctx.splited_varnames.size() == 1) {
auto varname = rpc_ctx.origin_varnames[0];
VLOG(4) << "recv " << varname << " from " << rpc_ctx.epmap[0];
rets.push_back(rpc_client->AsyncGetVarNoBarrier(rpc_ctx.epmap[0], cpu_ctx,
const auto place =
scope.FindVar(varname)->Get<framework::LoDTensor>().place();
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &ctx = *pool.Get(place);
VLOG(4) << "recv " << varname << " from " << rpc_ctx.epmap[0] << " in gpu? "
<< platform::is_gpu_place(place);
rets.push_back(rpc_client->AsyncGetVarNoBarrier(rpc_ctx.epmap[0], ctx,
scope, varname, varname));
for (size_t i = 0; i < rets.size(); i++) {
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_xpu.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ElementwiseAddXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
XPUElementwise<T, XPUAddFunctor<T>>(ctx);
}
};
template <typename DeviceContext, typename T>
class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
ElemwiseGradKernel<T>::Compute(ctx);
using Tensor = framework::Tensor;
auto *dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto *dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
auto dx_dims = dout->dims();
auto dy_dims_untrimed = dout->dims();
T *dx_data = NULL;
T *dy_data = NULL;
int axis = ctx.Attr<int>("axis");
PADDLE_ENFORCE_GE(dx_dims.size(), dy_dims_untrimed.size(),
"Rank of first input must >= rank of second input.");
if (dx != nullptr) {
dx->mutable_data<T>(ctx.GetPlace());
dx_dims = dx->dims();
dx_data = dx->data<T>();
}
if (dy != nullptr) {
dy->mutable_data<T>(ctx.GetPlace());
dy_dims_untrimed = dy->dims();
dy_data = dy->data<T>();
}
int pre, n, post, is_common_broadcast;
if (dx_dims == dy_dims_untrimed) {
pre = post = 1;
n = dout->numel();
} else {
axis = (axis == -1 ? dx_dims.size() - dy_dims_untrimed.size() : axis);
PADDLE_ENFORCE(axis >= 0 && axis < dx_dims.size(),
"Axis should be in range [0, dx_dims)");
auto dy_dims = trim_trailing_singular_dims(dy_dims_untrimed);
axis = (dy_dims.size() == 0) ? dx_dims.size() : axis;
get_mid_dims(dx_dims, dy_dims, axis, &pre, &n, &post,
&is_common_broadcast);
}
int len = pre * n * post;
auto &dev_ctx =
ctx.template device_context<paddle::platform::XPUDeviceContext>();
if (post == 1) {
int r = xpu::matrix_vector_add_grad(
dev_ctx.x_context(), dout->data<T>(), dout->data<T>(),
dout->data<T>(), dout->data<T>(), dx_data, dy_data, pre, n);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
return;
}
if (dx == nullptr) {
PADDLE_ENFORCE_EQ(
xpu_malloc(reinterpret_cast<void **>(&dx_data), len * sizeof(float)),
XPU_SUCCESS, platform::errors::External("XPU has no enough memory"));
}
if (dy == nullptr) {
PADDLE_ENFORCE_EQ(
xpu_malloc(reinterpret_cast<void **>(&dy_data), len * sizeof(float)),
XPU_SUCCESS, platform::errors::External("XPU has no enough memory"));
} else {
if (len != n) {
PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast<void **>(&dy_data),
len * sizeof(float)),
XPU_SUCCESS, platform::errors::External(
"XPU has no enough memory"));
}
}
int r = xpu::elementwise_add_grad(
dev_ctx.x_context(), dout->data<T>() /*x*/, dout->data<T>() /*y*/,
dout->data<T>() /*out*/, dout->data<T>(), dx_data, dy_data, len);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
if ((dy != nullptr) && (len != n)) {
r = xpu::reduce_ew(dev_ctx.x_context(), dy_data, dy->data<T>(), pre, n,
post, xpu::ElementwiseOp::ASSIGN);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
dev_ctx.Wait();
xpu_free(dy_data);
}
if ((dx == nullptr || dy == nullptr) && !(dy != nullptr && len != n)) {
dev_ctx.Wait();
}
if (dx == nullptr) {
xpu_free(dx_data);
}
if (dy == nullptr) {
xpu_free(dy_data);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
elementwise_add,
ops::ElementwiseAddXPUKernel<paddle::platform::XPUDeviceContext, float>);
REGISTER_OP_XPU_KERNEL(elementwise_add_grad,
ops::ElementwiseAddGradXPUKernel<
paddle::platform::XPUDeviceContext, float>);
#endif
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace operators {
template <typename T>
struct XPUAddFunctor {
int operator()(xpu::Context* ctx, const T* x, const T* y, T* z, int len) {
return xpu::elementwise_add(ctx, x, y, z, len);
}
};
template <typename T>
struct XPUMulFunctor {
int operator()(xpu::Context* ctx, const T* x, const T* y, T* z, int len) {
return xpu::elementwise_mul(ctx, x, y, z, len);
}
};
template <typename T, typename Functor>
void XPUElementwise(const framework::ExecutionContext& ctx) {
PADDLE_ENFORCE(platform::is_xpu_place(ctx.GetPlace()),
"This kernel only runs on XPU device.");
auto x_var = ctx.InputVar("X");
PADDLE_ENFORCE_NE(x_var, nullptr,
platform::errors::Fatal("Cannot get input Variable X"));
PADDLE_ENFORCE(x_var->IsType<framework::LoDTensor>(),
"XPU only support LoDTensor");
auto x = x_var->Get<framework::LoDTensor>();
auto* y = ctx.Input<framework::LoDTensor>("Y");
auto* z = ctx.Output<framework::LoDTensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
auto x_dims = x.dims();
auto y_dims_untrimed = y->dims();
PADDLE_ENFORCE_GE(x_dims.size(), y_dims_untrimed.size(),
"Rank of first input must >= rank of second input.");
axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
"Axis should be in range [0, x_dims)");
auto y_dims = trim_trailing_singular_dims(y_dims_untrimed);
axis = (y_dims.size() == 0) ? x_dims.size() : axis;
int pre, n, post, is_common_broadcast;
get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post, &is_common_broadcast);
int len = pre * n * post;
const T* x_data = x.data<T>();
const T* y_data = y->data<T>();
T* z_data = z->data<T>();
T* y_broadcast = nullptr;
auto& dev_ctx =
ctx.template device_context<paddle::platform::XPUDeviceContext>();
if (post == 1) {
if (std::is_same<Functor, XPUAddFunctor<T>>::value) {
int res = xpu::matrix_vector_add(dev_ctx.x_context(), x_data, y_data,
z_data, pre, n);
PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d",
res);
return;
}
if (std::is_same<Functor, XPUMulFunctor<T>>::value) {
int res = xpu::matrix_vector_mul(dev_ctx.x_context(), x_data, y_data,
z_data, pre, n);
PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d",
res);
return;
}
}
if (pre != 1 || post != 1) {
PADDLE_ENFORCE(xpu_malloc(reinterpret_cast<void**>(&y_broadcast),
len * sizeof(T)) == XPU_SUCCESS);
int res = xpu::broadcast_ew(dev_ctx.x_context(), y_data, y_broadcast, pre,
n, post, xpu::ElementwiseOp::ASSIGN);
PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d",
res);
y_data = y_broadcast;
}
Functor functor;
int res = functor(dev_ctx.x_context(), x_data, y_data, z_data, len);
PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d",
res);
if (pre != 1 || post != 1) {
dev_ctx.Wait();
xpu_free(y_broadcast);
}
}
} // namespace operators
} // namespace paddle
#endif
......@@ -46,6 +46,7 @@ class ScaleLoDTensorFunctor<platform::CPUDeviceContext, T> {
};
template class ScaleLoDTensorFunctor<platform::CPUDeviceContext, float>;
template class ScaleLoDTensorFunctor<platform::CPUDeviceContext, double>;
} // namespace math
} // namespace operators
......
......@@ -52,6 +52,7 @@ class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
};
template class ScaleLoDTensorFunctor<platform::CUDADeviceContext, float>;
template class ScaleLoDTensorFunctor<platform::CUDADeviceContext, double>;
} // namespace math
} // namespace operators
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include <algorithm>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
namespace operators {
static framework::DDim RowMatrixFromVector(const framework::DDim &x_dim) {
if (x_dim.size() > 1) {
return x_dim;
}
return framework::make_ddim({1, x_dim[0]});
}
static framework::Tensor FoldInitDims(const framework::Tensor &input) {
auto output = input;
auto in_dims = input.dims();
if (in_dims.size() == 3) {
output.Resize({in_dims[0] * in_dims[1], in_dims[2]});
}
return output;
}
/**
 * Get column matrix shape from a vector shape. If the rank of y_dim > 1, the
* original y_dim is returned.
*/
static framework::DDim ColumnMatrixFromVector(const framework::DDim &y_dim) {
if (y_dim.size() > 1) {
return y_dim;
}
return framework::make_ddim({y_dim[0], 1});
}
static void ReshapeTensorIntoMatrixSequence(
framework::Tensor *x, const math::MatDescriptor &descriptor) {
int64_t h, w;
h = descriptor.height_;
w = descriptor.width_;
if (descriptor.trans_) {
std::swap(w, h);
}
if (descriptor.batch_size_) {
x->Resize({descriptor.batch_size_, h, w});
} else {
x->Resize({h, w});
}
}
/**
* Reshape the x,y,out tensor to 3-D or 2-D tensor by matrix descriptor
* Out = matmul(x, y)
*
* This method will first calculate X,Y matrix sequence, and then calculate
* the out shape.
*
* Assume X = [BatchSize, H1, W1], Y = [BatchSize, H2, W2]
* The out = [BatchSize, H1, W2]
*
* If there is no batch size in `X` and `Y`, the out will be [H1, W2]
* If any of `X` and `Y` has batch size BatchSize, the out will have the
* BatchSize.
*/
static void ReshapeXYOutIntoMatrixSequence(framework::Tensor *x,
framework::Tensor *y,
framework::Tensor *out, bool trans_x,
bool trans_y) {
auto x_dim = RowMatrixFromVector(x->dims());
auto y_dim = ColumnMatrixFromVector(y->dims());
auto mat_dim_x = math::CreateMatrixDescriptor(x_dim, 0, trans_x);
auto mat_dim_y = math::CreateMatrixDescriptor(y_dim, 0, trans_y);
if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) {
out->Resize({mat_dim_x.height_, mat_dim_y.width_});
} else {
out->Resize({std::max(mat_dim_x.batch_size_, mat_dim_y.batch_size_),
mat_dim_x.height_, mat_dim_y.width_});
}
ReshapeTensorIntoMatrixSequence(x, mat_dim_x);
ReshapeTensorIntoMatrixSequence(y, mat_dim_y);
}
template <typename DeviceContext, typename T>
class MatMulXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *x = context.Input<framework::Tensor>("X");
auto *y = context.Input<framework::Tensor>("Y");
auto *out = context.Output<framework::Tensor>("Out");
out->mutable_data<T>(context.GetPlace());
auto mat_dim_a = math::CreateMatrixDescriptor(
RowMatrixFromVector(x->dims()), 0, context.Attr<bool>("transpose_X"));
auto mat_dim_b =
math::CreateMatrixDescriptor(ColumnMatrixFromVector(y->dims()), 0,
context.Attr<bool>("transpose_Y"));
    PADDLE_ENFORCE_EQ(
        mat_dim_a.width_, mat_dim_b.height_,
        platform::errors::InvalidArgument("Shape mismatch in matmul_op"));
    PADDLE_ENFORCE_EQ(
        mat_dim_a.batch_size_, mat_dim_b.batch_size_,
        platform::errors::InvalidArgument("Shape mismatch in matmul_op"));
T alpha = static_cast<T>(context.Attr<float>("alpha"));
auto &dev_ctx = context.template device_context<DeviceContext>();
float *data_c = out->data<T>();
if (mat_dim_a.batch_size_ == 0 || mat_dim_a.batch_size_ == 1) {
int r =
xpu::fc_int16(dev_ctx.x_context(), mat_dim_a.trans_, mat_dim_b.trans_,
mat_dim_a.height_, mat_dim_b.width_, mat_dim_a.width_,
alpha, x->data<T>(), y->data<T>(), 0.0f, data_c);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
} else {
// batch matmul
int r = xpu::batched_gemm_int16(dev_ctx.x_context(), mat_dim_a.trans_,
mat_dim_b.trans_, mat_dim_a.batch_size_,
mat_dim_a.height_, mat_dim_b.width_,
mat_dim_a.width_, alpha, x->data<T>(),
y->data<T>(), data_c, nullptr, nullptr);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
}
}
};
// Reshape a rank-3 tensor from P x M x N to M x (P * N).
// (Warning: This requires transposing data and writes into new memory.)
// Identity op if the tensor is not of rank 3.
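// For example, a [4, 2, 3] input is transposed with axis order {1, 0, 2} into
// a [2, 4, 3] buffer, which is then viewed as a [2, 12] matrix.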
template <typename DeviceContext, typename T>
static framework::Tensor XPUFoldHeadAndLastDims(
const DeviceContext &context, const framework::Tensor &input) {
auto in_dims = input.dims();
if (in_dims.size() != 3) {
return input;
}
framework::Tensor output;
output.Resize({in_dims[1], in_dims[0], in_dims[2]});
output.mutable_data<T>(context.GetPlace());
std::vector<int> in_shape_host = {static_cast<int>(in_dims[0]),
static_cast<int>(in_dims[1]),
static_cast<int>(in_dims[2])};
std::vector<int> axis_host = {1, 0, 2};
int r = xpu::transpose(context.x_context(), input.data<T>(), output.data<T>(),
in_shape_host.data(), axis_host.data(), /*ndims=*/3);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
output.Resize({in_dims[1], in_dims[0] * in_dims[2]});
return output;
}
// Using dimensional constraints on matrix multiplication, it is
// straight-forward to check the following table for when X and Y
// are both matrices.
//
// transpose_X | False | True | False | True
// transpose_Y | False | False | True | True
// -----------+----------+----------+----------+-----------
// dX = | dOut Y^T | Y dOut^T | dOut Y | Y^T dOut^T
// dY = | X^T dOut | X dOut | dOut^T X | dOut^T X^T
//
// When X is a vector of size K, we treat it instead as a matrix of shape
// (1, K). Similarly, when Y is a vector of size K, we treat it instead as
// a matrix of shape (K, 1).
//
// When X and Y are both 3-dimensional tensors, then the first dimension
// the batch dimension can be ignored and the exact same formulas apply
// as for two matrices.
//
// Finally, when, e.g., X is a 3-dimensional tensor but Y is a matrix, we end
// up with formulas like
//
// dY_{ij} = \sum_{p, m} X_{pmi} dOut_{pmj}
//
// To handle this sort of scenario, we reshape X : P x M x K, dOut: P x M x N
// to X: (P * M) x K, dOut: (P * M) x N.
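//
// As a concrete sketch with P = 4, M = 2, K = 3, N = 5 (and no transposes):
// X is folded from [4, 2, 3] to [8, 3] and dOut from [4, 2, 5] to [8, 5], so a
// single dY = X^T dOut GEMM produces the [3, 5] gradient of Y in one call.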
template <typename DeviceContext, typename T>
class MatMulGradXPUKernel : public framework::OpKernel<T> {
public:
void MatMul(const framework::ExecutionContext &context,
const framework::Tensor &a, bool trans_a,
const framework::Tensor &b, bool trans_b,
framework::Tensor *out) const {
out->mutable_data<T>(context.GetPlace());
auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a);
auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b);
    PADDLE_ENFORCE_EQ(
        mat_dim_a.width_, mat_dim_b.height_,
        platform::errors::InvalidArgument("Shape mismatch in matmul_grad_op"));
    PADDLE_ENFORCE_EQ(
        mat_dim_a.batch_size_, mat_dim_b.batch_size_,
        platform::errors::InvalidArgument("Shape mismatch in matmul_grad_op"));
T alpha = static_cast<T>(context.Attr<float>("alpha"));
auto &dev_ctx = context.template device_context<DeviceContext>();
float *data_c = out->data<T>();
if (mat_dim_a.batch_size_ == 0 || mat_dim_a.batch_size_ == 1) {
int r =
xpu::fc_int16(dev_ctx.x_context(), mat_dim_a.trans_, mat_dim_b.trans_,
mat_dim_a.height_, mat_dim_b.width_, mat_dim_a.width_,
alpha, a.data<T>(), b.data<T>(), 0.0f, data_c);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
} else {
// batch matmul
int r = xpu::batched_gemm_int16(dev_ctx.x_context(), mat_dim_a.trans_,
mat_dim_b.trans_, mat_dim_a.batch_size_,
mat_dim_a.height_, mat_dim_b.width_,
mat_dim_a.width_, alpha, a.data<T>(),
b.data<T>(), data_c, nullptr, nullptr);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
}
}
void CalcInputGrad(const framework::ExecutionContext &context,
const framework::Tensor &a, bool trans_a,
bool is_fold_init_dims_a, const framework::Tensor &b,
bool trans_b, bool is_fold_init_dims_b,
framework::Tensor *out) const {
if (out == nullptr) return;
bool need_combine = (a.dims().size() == 3 || b.dims().size() == 3) &&
out->dims().size() == 2;
if (!need_combine) {
MatMul(context, a, trans_a, b, trans_b, out);
} else {
auto &dev_ctx = context.template device_context<DeviceContext>();
MatMul(
context, is_fold_init_dims_a
? FoldInitDims(a)
: XPUFoldHeadAndLastDims<DeviceContext, T>(dev_ctx, a),
trans_a, is_fold_init_dims_b
? FoldInitDims(b)
: XPUFoldHeadAndLastDims<DeviceContext, T>(dev_ctx, b),
trans_b, out);
}
}
void Compute(const framework::ExecutionContext &context) const override {
auto x = *context.Input<framework::Tensor>("X");
auto y = *context.Input<framework::Tensor>("Y");
auto dout =
*context.Input<framework::Tensor>(framework::GradVarName("Out"));
auto *dx = context.Output<framework::Tensor>(framework::GradVarName("X"));
auto *dy = context.Output<framework::Tensor>(framework::GradVarName("Y"));
bool transpose_x = context.Attr<bool>("transpose_X");
bool transpose_y = context.Attr<bool>("transpose_Y");
ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y);
framework::DDim dx_dims;
if (dx) {
dx_dims = dx->dims();
if (dx_dims != x.dims()) {
dx->Resize(x.dims());
}
}
framework::DDim dy_dims;
if (dy) {
dy_dims = dy->dims();
if (dy_dims != y.dims()) {
dy->Resize(y.dims());
}
}
if (transpose_x && transpose_y) {
CalcInputGrad(context, y, true, true, dout, true, false, dx);
CalcInputGrad(context, dout, true, true, x, true, false, dy);
} else if (transpose_x) {
CalcInputGrad(context, y, false, false, dout, true, false, dx);
CalcInputGrad(context, x, false, false, dout, false, true, dy);
} else if (transpose_y) {
CalcInputGrad(context, dout, false, false, y, false, true, dx);
CalcInputGrad(context, dout, true, true, x, false, true, dy);
} else {
CalcInputGrad(context, dout, false, false, y, true, false, dx);
CalcInputGrad(context, x, true, true, dout, false, true, dy);
}
if (dx) {
if (dx_dims != x.dims()) {
dx->Resize(dx_dims);
}
}
if (dy) {
if (dy_dims != y.dims()) {
dy->Resize(dy_dims);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
matmul, ops::MatMulXPUKernel<paddle::platform::XPUDeviceContext, float>);
REGISTER_OP_XPU_KERNEL(
matmul_grad,
ops::MatMulGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
#endif
......@@ -14,11 +14,11 @@ limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/mul_op.h"
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/operators/mul_op.h"
namespace paddle {
namespace operators {
......
......@@ -42,21 +42,21 @@ class MVOp : public framework::OperatorWithKernel {
OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "mv");
auto dim_x = context->GetInputDim("X");
auto dim_y = context->GetInputDim("Vec");
auto dim_vec = context->GetInputDim("Vec");
PADDLE_ENFORCE_EQ(
dim_x.size(), 2,
platform::errors::InvalidArgument(
"The rank of input X should be 2, but is %d", dim_x.size()));
PADDLE_ENFORCE_EQ(
dim_y.size(), 1,
dim_vec.size(), 1,
platform::errors::InvalidArgument(
"The rank of input Vec should be 1, but is %d", dim_y.size()));
PADDLE_ENFORCE_EQ(dim_x[1] == dim_y[0], true,
"The rank of input Vec should be 1, but is %d", dim_vec.size()));
PADDLE_ENFORCE_EQ(dim_x[1], dim_vec[0],
platform::errors::InvalidArgument(
"The length of input X' second dim should equal the "
"length of input Vec,"
" but X[%d, %d], Vec[%d]",
dim_x[0], dim_x[1], dim_y[0]));
"X's second dimension is expected to be equal to "
"Vec's first dimension"
"but recieved X'shape = [%s], Vec's shape = [%s]",
dim_x, dim_vec));
framework::DDim dim_out = framework::make_ddim({dim_x[0]});
......
......@@ -19,8 +19,8 @@ namespace paddle {
namespace operators {
template <typename T>
__global__ void MVGradCUDAKernel(const int m, const int n, const T *dout,
const T *vec, T *dx) {
__global__ void MVGradDxCUDAKernel(const int m, const int n, const T *dout,
const T *vec, T *dx) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
for (; idx < m * n; idx += blockDim.x * gridDim.x) {
int i = idx / n;
......@@ -52,32 +52,31 @@ class MVGradKernel<platform::CUDADeviceContext, T>
int m = dim_x[0];
int n = dim_x[1];
dx->Resize(framework::make_ddim({m * n}));
// get data ptr
const T *x_data = x->data<T>();
const T *vec_data = vec->data<T>();
const T *dout_data = dout->data<T>();
T *dx_data = dx->mutable_data<T>(context.GetPlace());
T *dvec_data = dvec->mutable_data<T>(context.GetPlace());
auto &dev_ctx =
context.template device_context<platform::CUDADeviceContext>();
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
// calculate dx
auto stream = context.cuda_device_context().stream();
auto config = GetGpuLaunchConfig1D(dev_ctx, m * n);
MVGradCUDAKernel<
T><<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
m, n, dout_data, vec_data, dx_data);
dx->Resize(framework::make_ddim({m, n}));
if (dx) {
T *dx_data = dx->mutable_data<T>(context.GetPlace());
MVGradDxCUDAKernel<
T><<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
m, n, dout_data, vec_data, dx_data);
}
if (dvec) {
T *dvec_data = dvec->mutable_data<T>(context.GetPlace());
// calculate dvec
blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data,
static_cast<T>(0), dvec_data);
blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data,
static_cast<T>(0), dvec_data);
}
}
};
......
......@@ -74,30 +74,30 @@ class MVGradKernel : public framework::OpKernel<T> {
int m = dim_x[0];
int n = dim_x[1];
dx->Resize(framework::make_ddim({m * n}));
// get data ptr
const T *x_data = x->data<T>();
const T *vec_data = vec->data<T>();
const T *dout_data = dout->data<T>();
T *dx_data = dx->mutable_data<T>(context.GetPlace());
T *dvec_data = dvec->mutable_data<T>(context.GetPlace());
auto &dev_ctx = context.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
if (dx) {
T *dx_data = dx->mutable_data<T>(context.GetPlace());
// calculate dx
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j)
dx_data[i * n + j] = dout_data[i] * vec_data[j];
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
dx_data[i * n + j] = dout_data[i] * vec_data[j];
}
}
}
dx->Resize(framework::make_ddim({m, n}));
if (dvec) {
T *dvec_data = dvec->mutable_data<T>(context.GetPlace());
auto &dev_ctx = context.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
// calculate dvec
blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data,
static_cast<T>(0), dvec_data);
blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data,
static_cast<T>(0), dvec_data);
}
}
};
......
......@@ -24,32 +24,45 @@ class DpsgdOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
"Input(Param) of DpsgdOp should not be null.");
platform::errors::NotFound(
"Input(Param) of DpsgdOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true,
"Input(Grad) of DpsgdOp should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true,
"Input(LearningRate) of DpsgdOp should not be null.");
platform::errors::NotFound(
"Input(Grad) of DpsgdOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasInput("LearningRate"), true,
platform::errors::NotFound(
"Input(LearningRate) of DpsgdOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->GetInputsVarType("Param").front(),
framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s",
ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
platform::errors::InvalidArgument(
"The input var's type should be LoDTensor, but the received is %s",
ctx->GetInputsVarType("Param").front()));
PADDLE_ENFORCE_EQ(
ctx->GetInputsVarType("Grad").front(),
framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s",
ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
platform::errors::InvalidArgument(
"The input var's type should be LoDTensor, but the received is %s",
ctx->GetInputsVarType("Grad").front()));
PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
"Output(ParamOut) of DpsgdOp should not be null.");
platform::errors::NotFound(
"Output(ParamOut) of DpsgdOp should not be null."));
auto lr_dims = ctx->GetInputDim("LearningRate");
PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
"Learning rate should have 1 dimension");
platform::errors::InvalidArgument(
"Learning rate should have 1 dimension. But Received "
"LearningRate's dims [%s].",
framework::product(lr_dims)));
auto param_dims = ctx->GetInputDim("Param");
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Grad"),
"Param and Grad input of DpsgdOp should have same dimension");
platform::errors::InvalidArgument(
"Param and Grad input of DpsgdOp should have same dimension. But "
"received Para's dim [%s] and Grad's dim [%s].",
param_dims, ctx->GetInputDim("Grad")));
ctx->SetOutputDim("ParamOut", param_dims);
}
......
......@@ -28,17 +28,19 @@ class DpsgdOpKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *param_var = ctx.InputVar("Param");
PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.InputNames("Param").front(),
framework::ToTypeName(param_var->Type()));
platform::errors::InvalidArgument(
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.InputNames("Param").front(),
framework::ToTypeName(param_var->Type())));
const auto *grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.InputNames("Grad").front(),
framework::ToTypeName(grad_var->Type()));
platform::errors::InvalidArgument(
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.InputNames("Grad").front(),
framework::ToTypeName(grad_var->Type())));
const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
......
......@@ -40,43 +40,62 @@ class MomentumOp : public framework::OperatorWithKernel {
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Param"),
"Input(param) of Momentum should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Grad"),
"Input(grad) of Momentum should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Velocity"),
"Input(velocity) of Momentum should not be null.");
PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
"Input(LearningRate) of Momentum should not be null.");
PADDLE_ENFORCE(
ctx->GetInputsVarType("Param").front() ==
framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s",
ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(ParamOut) of Momentum should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("VelocityOut"),
"Output(VelocityOut) of Momentum should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
platform::errors::NotFound(
"Input(param) of Momentum should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true,
platform::errors::NotFound(
"Input(grad) of Momentum should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Velocity"), true,
platform::errors::NotFound(
"Input(velocity) of Momentum should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasInput("LearningRate"), true,
platform::errors::NotFound(
"Input(LearningRate) of Momentum should not be null."));
PADDLE_ENFORCE_EQ(
ctx->GetInputsVarType("Param").front(),
framework::proto::VarType::LOD_TENSOR,
platform::errors::InvalidArgument(
"The input var's type should be LoDTensor, but the received is %s",
ctx->GetInputsVarType("Param").front()));
PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
platform::errors::NotFound(
"Output(ParamOut) of Momentum should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasOutput("VelocityOut"), true,
platform::errors::NotFound(
"Output(VelocityOut) of Momentum should not be null."));
auto lr_dims = ctx->GetInputDim("LearningRate");
PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
"Maybe the Input variable LearningRate has not "
"been initialized. You may need to confirm "
"if you put exe.run(startup_program) "
"after optimizer.minimize function.");
platform::errors::InvalidArgument(
"Maybe the Input variable LearningRate has not "
"been initialized. You may need to confirm "
"if you put exe.run(startup_program) "
"after optimizer.minimize function."));
PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
"Learning_rate should be a scalar");
platform::errors::InvalidArgument(
"Learning_rate should be a scalar. But Received "
"LearningRate's dim [%s]",
framework::product(lr_dims)));
auto param_dim = ctx->GetInputDim("Param");
if (ctx->GetInputsVarType("Grad")[0] ==
framework::proto::VarType::LOD_TENSOR) {
PADDLE_ENFORCE_EQ(
param_dim, ctx->GetInputDim("Grad"),
"Param and Grad input of MomentumOp should have the same dimension.");
platform::errors::InvalidArgument(
"Param and Grad input of MomentumOp should have the same "
"dimension. But received Param's dim [%s] and Grad's dim [%s].",
param_dim, ctx->GetInputDim("Grad")));
PADDLE_ENFORCE_EQ(
param_dim, ctx->GetInputDim("Velocity"),
"Param and Velocity of MomentumOp should have the same dimension.");
platform::errors::InvalidArgument(
"Param and Velocity of MomentumOp should have the same "
"dimension. But received Param's dim [%s] and Velocity [%s].",
param_dim, ctx->GetInputDim("Velocity")));
}
ctx->SetOutputDim("ParamOut", param_dim);
......@@ -398,10 +417,12 @@ class MomentumOpKernel : public framework::OpKernel<T> {
for_range(functor);
}
} else {
PADDLE_THROW(
string::Sprintf("MomentumOp only supports LoDTensor or SelectedRows "
"gradient, but the received Variable Type is %s",
framework::ToTypeName(grad_var->Type())));
PADDLE_ENFORCE_EQ(false, true,
platform::errors::PermissionDenied(
"Unsupported Variable Type of Grad "
"in MomentumOp. Excepted LodTensor "
"or SelectedRows, But received [%s]",
paddle::framework::ToTypeName(grad_var->Type())));
}
}
};
......
......@@ -22,47 +22,75 @@ class RmspropOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Param"),
"Input(Param) of RmspropOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("MeanSquare"),
"Input(MeanSquare) of RmspropOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
"Input(LearningRate) of RmspropOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Grad"),
"Input(Grad) of RmspropOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Moment"),
"Input(Moment) of RmspropOp should not be null.");
PADDLE_ENFORCE(
ctx->GetInputsVarType("Param").front() ==
framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s",
ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(param_out) of RmspropOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
"Output(MomentOut) of RmspropOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("MeanSquareOut"),
"Output(MeanSquareOut) of RmspropOp should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
platform::errors::NotFound(
"Input(Param) of RmspropOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasInput("MeanSquare"), true,
platform::errors::NotFound(
"Input(MeanSquare) of RmspropOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasInput("LearningRate"), true,
platform::errors::NotFound(
"Input(LearningRate) of RmspropOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true,
platform::errors::NotFound(
"Input(Grad) of RmspropOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Moment"), true,
platform::errors::NotFound(
"Input(Moment) of RmspropOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("Param").front(),
framework::proto::VarType::LOD_TENSOR,
platform::errors::InvalidArgument(
"The input var's type in RmspropOp should be "
"LoDTensor, but the received is %s",
ctx->GetInputsVarType("Param").front()));
PADDLE_ENFORCE_EQ(
ctx->HasOutput("ParamOut"), true,
platform::errors::NotFound(
"Output(param_out) of RmspropOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasOutput("MomentOut"), true,
platform::errors::NotFound(
"Output(MomentOut) of RmspropOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasOutput("MeanSquareOut"), true,
platform::errors::NotFound(
"Output(MeanSquareOut) of RmspropOp should not be null."));
if (ctx->Attrs().Get<bool>("centered")) {
PADDLE_ENFORCE(ctx->HasOutput("MeanGradOut"),
"Output(MeanGradOut) of RmspropOp should not be null.");
PADDLE_ENFORCE_EQ(
ctx->HasOutput("MeanGradOut"), true,
platform::errors::NotFound(
"Output(MeanGradOut) of RmspropOp should not be null."));
}
auto param_dim = ctx->GetInputDim("Param");
PADDLE_ENFORCE_EQ(
param_dim, ctx->GetInputDim("Grad"),
"Param and grad input of RmspropOp should have the same dimension.");
platform::errors::InvalidArgument(
"Param and grad input of RmspropOp should have the same dimension. "
"But received Param's dim [%s] and Grad's dim [%s].",
param_dim, ctx->GetInputDim("Grad")));
PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Moment"),
"Param and Momentum input of RmspropOp "
"should have the same dimension.");
platform::errors::InvalidArgument(
"Param and Momentum input of RmspropOp "
"should have the same dimension. But received "
"Param's dim [%s] and Moment [%s]",
param_dim, ctx->GetInputDim("Moment")));
PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("MeanSquare"),
"Param and Momentum input of RmspropOp "
"should have the same dimension.");
platform::errors::InvalidArgument(
"Param and Momentum input of RmspropOp "
"should have the same dimension. But received "
"Param's dim [%s] and MeanSquare [%s]",
param_dim, ctx->GetInputDim("MeanSquare")));
auto lr_dim = ctx->GetInputDim("LearningRate");
PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1,
"Learning Rate should be a scalar.");
platform::errors::InvalidArgument(
"Learning Rate of RmspropOp should be a scalar. But "
"received LearningRate's dim [%s]",
framework::product(lr_dim)));
ctx->SetOutputDim("ParamOut", param_dim);
ctx->SetOutputDim("MomentOut", param_dim);
......
......@@ -148,11 +148,15 @@ class RmspropOpKernel : public framework::OpKernel<T> {
auto &mom_tensor = *ctx.Input<LoDTensor>("Moment");
PADDLE_ENFORCE_EQ(&p_tensor, param_out,
"Param and ParamOut must be the same Tensor");
platform::errors::InvalidArgument(
"Param and ParamOut must be the same Tensor"));
PADDLE_ENFORCE_EQ(&mom_tensor, moment_out,
"Moment and MomentOut must be the same Tensor");
PADDLE_ENFORCE_EQ(&ms_tensor, mean_square_out,
"MeanSquare and MeanSquareOut must be the same Tensor");
platform::errors::InvalidArgument(
"Moment and MomentOut must be the same Tensor"));
PADDLE_ENFORCE_EQ(
&ms_tensor, mean_square_out,
platform::errors::InvalidArgument(
"MeanSquare and MeanSquareOut must be the same Tensor"));
auto &dev_ctx = ctx.template device_context<DeviceContext>();
size_t limit = static_cast<size_t>(ms_tensor.numel());
......@@ -179,8 +183,10 @@ class RmspropOpKernel : public framework::OpKernel<T> {
auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
auto mg = EigenVector<T>::Flatten(mg_tensor);
auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out,
"MeanGrad and MeanGradOut must be the same Tensor");
PADDLE_ENFORCE_EQ(
&mg_tensor, mean_grad_out,
platform::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
mg_out.device(place) = rho * mg + (1 - rho) * g;
......@@ -198,8 +204,10 @@ class RmspropOpKernel : public framework::OpKernel<T> {
if (centered) {
auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out,
"MeanGrad and MeanGradOut must be the same Tensor");
PADDLE_ENFORCE_EQ(
&mg_tensor, mean_grad_out,
platform::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
for_range(CenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>(
param_out->mutable_data<T>(ctx.GetPlace()),
mean_square_out->mutable_data<T>(ctx.GetPlace()),
......@@ -233,8 +241,10 @@ class RmspropOpKernel : public framework::OpKernel<T> {
if (centered) {
auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out,
"MeanGrad and MeanGradOut must be the same Tensor");
PADDLE_ENFORCE_EQ(
&mg_tensor, mean_grad_out,
platform::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
for_range(CenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>(
param_out->mutable_data<T>(ctx.GetPlace()),
mean_square_out->mutable_data<T>(ctx.GetPlace()),
......@@ -249,7 +259,12 @@ class RmspropOpKernel : public framework::OpKernel<T> {
rho, epsilon, momentum, grad_func));
}
} else {
PADDLE_THROW("RMSProp only supports LoDTensor or SelectedRows gradient");
PADDLE_ENFORCE_EQ(false, true,
platform::errors::PermissionDenied(
"Unsupported Variable Type of Grad "
"in RmspropOp. Excepted LodTensor "
"or SelectedRows, But received [%s]",
paddle::framework::ToTypeName(grad_var->Type())));
}
}
};
......
......@@ -22,23 +22,31 @@ class SGDOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Param"),
"Input(Param) of SGDOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Grad"),
"Input(Grad) of SGDOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
"Input(LearningRate) of SGDOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(ParamOut) of SGDOp should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
platform::errors::NotFound(
"Input(Param) of SGDOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasInput("Grad"), true,
platform::errors::NotFound("Input(Grad) of SGDOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true,
platform::errors::NotFound(
"Input(LearningRate) of SGDOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
platform::errors::NotFound(
"Output(ParamOut) of SGDOp should not be null."));
auto lr_dims = ctx->GetInputDim("LearningRate");
PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
"Maybe the Input variable LearningRate has not "
"been initialized. You may need to confirm "
"if you put exe.run(startup_program) "
"after optimizer.minimize function.");
platform::errors::NotFound(
"Maybe the Input variable LearningRate has not "
"been initialized. You may need to confirm "
"if you put exe.run(startup_program) "
"after optimizer.minimize function."));
PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
"Learning rate should have 1 element");
platform::errors::InvalidArgument(
"Learning rate should have 1 element. But received "
"LearningRate dims [%s]",
framework::product(lr_dims)));
auto param_dim = ctx->GetInputDim("Param");
if (ctx->GetInputsVarType("Grad")[0] ==
framework::proto::VarType::LOD_TENSOR) {
......
......@@ -57,11 +57,12 @@ class SGDOpKernel<platform::CUDADeviceContext, T>
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.InputNames("Param").front(),
framework::ToTypeName(param_var->Type()));
PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
platform::errors::InvalidArgument(
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.InputNames("Param").front(),
paddle::framework::ToTypeName(param_var->Type())));
auto* param = ctx.Input<framework::Tensor>("Param");
auto* param_out = ctx.Output<framework::Tensor>("ParamOut");
......@@ -91,18 +92,30 @@ class SGDOpKernel<platform::CUDADeviceContext, T>
// TODO(qijun): In Sparse SGD operator, in-place update is enforced.
// This manual optimization brings difficulty to track data dependency.
// It's better to find a more elegant solution.
PADDLE_ENFORCE_EQ(param, param_out);
PADDLE_ENFORCE_EQ(
param, param_out,
platform::errors::InvalidArgument(
"The input tensor Param of SgdOp should be equal with ParamOut "
"if variable's type is SelectedRows."));
auto* grad = ctx.Input<framework::SelectedRows>("Grad");
auto in_height = grad->height();
auto out_dims = param_out->dims();
PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
PADDLE_ENFORCE_EQ(in_height, out_dims[0],
platform::errors::InvalidArgument(
"The input tensor Grad's height of SgdOp should be "
"equal with ParamOut's dims. But received Grad's "
"height [%s] and ParamOut's dims [%s]",
in_height, out_dims[0]));
auto& in_value = grad->value();
auto& in_rows = grad->rows();
int64_t in_row_numel = in_value.numel() / in_rows.size();
PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);
PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height,
platform::errors::InvalidArgument(
"The in_row_numel of SgdOp should be equal with "
"param_out's numel / in_height."));
auto* in_data = in_value.data<T>();
auto* out_data = param_out->data<T>();
......@@ -118,7 +131,12 @@ class SGDOpKernel<platform::CUDADeviceContext, T>
out_data, in_row_numel, in_rows.size());
} else {
PADDLE_THROW("Unsupported Variable Type of Grad");
PADDLE_ENFORCE_EQ(false, true,
platform::errors::PermissionDenied(
"Unsupported Variable Type of Grad "
"in SgdOp. Excepted LodTensor or "
"SelectedRows, But received [%s]",
paddle::framework::ToTypeName(grad_var->Type())));
}
}
};
......
......@@ -44,8 +44,20 @@ class SGDOpKernel<platform::CPUDeviceContext, T>
if (grad_var->IsType<framework::LoDTensor>()) {
const auto *grad = ctx.Input<framework::Tensor>("Grad");
auto sz = param_out->numel();
PADDLE_ENFORCE_EQ(param->numel(), sz);
PADDLE_ENFORCE_EQ(grad->numel(), sz);
PADDLE_ENFORCE_EQ(param->numel(), sz,
platform::errors::InvalidArgument(
"The input tensor Param's numel of SgdOp "
"should be equal with ParamOut's numel. "
"But received Param's "
"numel = [%s], ParamOut's numel = [%s]",
param->numel(), sz));
PADDLE_ENFORCE_EQ(grad->numel(), sz,
platform::errors::InvalidArgument(
"The input tensor Grad's numel of SgdOp "
"should be equal with ParamOut's numel. "
"But received Grad's "
"numel = [%s], ParamOut's numel = [%s]",
grad->numel(), sz));
jit::sgd_attr_t attr(1, sz, 1, sz, 1);
const T *lr = learning_rate->data<T>();
......@@ -62,7 +74,11 @@ class SGDOpKernel<platform::CPUDeviceContext, T>
// TODO(qijun): In Sparse SGD operator, in-place update is enforced.
// This manual optimization brings difficulty to track data dependency.
// It's better to find a more elegant solution.
PADDLE_ENFORCE_EQ(param, param_out);
PADDLE_ENFORCE_EQ(param, param_out,
platform::errors::InvalidArgument(
"The input tensor Param of SgdOp "
"should be equal with ParamOut if variable's "
"type is SelectedRows. "));
const auto *grad = ctx.Input<framework::SelectedRows>("Grad");
auto &grad_rows = grad->rows();
......@@ -73,7 +89,13 @@ class SGDOpKernel<platform::CPUDeviceContext, T>
}
auto out_dims = param_out->dims();
PADDLE_ENFORCE_EQ(grad->height(), out_dims[0]);
PADDLE_ENFORCE_EQ(
grad->height(), out_dims[0],
platform::errors::InvalidArgument(
"The input tensor Grad's height of SgdOp "
"should be equal with ParamOut's dims. But received Grad's "
"height [%s] and ParamOut's dims [%s]",
grad->height(), out_dims[0]));
auto &grad_value = grad->value();
const T *param_data = param->data<T>();
const T *grad_data = grad_value.data<T>();
......@@ -87,19 +109,31 @@ class SGDOpKernel<platform::CPUDeviceContext, T>
attr.grad_height = grad_rows.size(); // note: it is not grad->height()
attr.grad_width = grad_value.numel() / attr.grad_height;
attr.selected_rows_size = grad_rows.size();
PADDLE_ENFORCE_EQ(attr.grad_width, attr.param_width);
PADDLE_ENFORCE_EQ(
attr.grad_width, attr.param_width,
platform::errors::InvalidArgument(
"The grad_value's numel of SgdOp "
"should be equal with param_out's numel. But received "
"grad_value's numel [%s] and param_out's numel [%s]",
attr.grad_width, attr.param_width));
auto sgd =
jit::KernelFuncs<jit::SgdTuple<T>, platform::CPUPlace>::Cache().At(
attr);
sgd(lr, param_data, grad_data, rows_data, out_data, &attr);
} else {
PADDLE_THROW("Unsupported Variable Type of Grad");
PADDLE_ENFORCE_EQ(
false, true,
platform::errors::PermissionDenied(
"Unsupported Variable Type of Grad in SgdOp. Excepted "
"LodTensor or SelectedRows, But received [%s]",
paddle::framework::ToTypeName(grad_var->Type())));
}
} else if (param_var->IsType<framework::SelectedRows>()) {
PADDLE_ENFORCE(grad_var->IsType<framework::SelectedRows>(),
"when param "
"is SelectedRows, gradient should also be SelectedRows");
PADDLE_ENFORCE_EQ(grad_var->IsType<framework::SelectedRows>(), true,
platform::errors::InvalidArgument(
"when param is SelectedRows, "
"gradient should also be SelectedRows"));
const auto &param = param_var->Get<framework::SelectedRows>();
auto *param_out = ctx.Output<framework::SelectedRows>("ParamOut");
const auto &grad = grad_var->Get<framework::SelectedRows>();
......@@ -112,27 +146,36 @@ class SGDOpKernel<platform::CPUDeviceContext, T>
auto param_row_width = param.value().dims()[1];
auto grad_row_width = grad.value().dims()[1];
VLOG(4) << " param rows: " << param.rows().size()
<< " param memory rows: " << param.value().dims()[0]
<< " grad rows: " << grad.rows().size()
<< " grad memory rows: " << grad.value().dims()[0];
PADDLE_ENFORCE_EQ(param_row_width, grad_row_width,
"param_row should have the same size with grad_row");
PADDLE_ENFORCE_EQ(
param_row_width, grad_row_width,
platform::errors::InvalidArgument(
"The param_row in SgdOP should have the same size with grad_row. "
"But received param_row's width is [%s], and grad_row's width is "
"[%s]",
param_row_width, grad_row_width));
const auto *lr = learning_rate->data<T>();
const auto *grad_data = grad.value().data<T>();
auto *out_data = param_out->mutable_value()->data<T>();
for (size_t i = 0; i < grad.rows().size(); i++) {
int64_t id_index = param_out->AutoGrownIndex(grad.rows()[i], false);
PADDLE_ENFORCE_GE(id_index, static_cast<int64_t>(0),
"id should be in the table");
PADDLE_ENFORCE_GE(
id_index, static_cast<int64_t>(0),
platform::errors::InvalidArgument(
"The id in SgdOp should be >= 0. But recevied id_index is [%s]",
id_index));
for (int64_t j = 0; j < grad_row_width; j++) {
out_data[id_index * grad_row_width + j] -=
lr[0] * grad_data[i * grad_row_width + j];
}
}
} else {
PADDLE_THROW("Unsupported Variable Type of Parameter");
PADDLE_ENFORCE_EQ(
false, true,
platform::errors::PermissionDenied(
"Unsupported Variable Type of Parameter in SgdOp. Excepted "
"LodTensor or SelectedRows, But received [%s]",
paddle::framework::ToTypeName(param_var->Type())));
}
}
};
......
......@@ -45,8 +45,10 @@ template <typename T>
class PoolCUDNNOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
"It must use CUDAPlace.");
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::InvalidArgument("Pool operator CUDA kernel must use "
"CUDAPlace rather than CPUPlace."));
const Tensor *input = ctx.Input<Tensor>("X");
Tensor *output = ctx.Output<Tensor>("Out");
......@@ -175,8 +177,10 @@ template <typename T>
class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
"It must use CUDAPlace.");
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::InvalidArgument("Pool operator CUDA kernel must use "
"CUDAPlace rather than CPUPlace."));
const Tensor *input = ctx.Input<Tensor>("X");
const Tensor *output = ctx.Input<Tensor>("Out");
......
......@@ -38,18 +38,22 @@ int PoolOutputSize(int input_size, int filter_size, int padding_1,
}
PADDLE_ENFORCE_GT(
output_size, 0,
"ShapeError: the output size must be greater than 0. But received: "
"output_size = %d due to the settings of input_size(%d), padding(%d,%d), "
"k_size(%d) and stride(%d). Please check again!",
output_size, input_size, padding_1, padding_2, filter_size, stride);
platform::errors::InvalidArgument(
"the output size must be greater than 0. But received: "
"output_size = %d due to the settings of input_size(%d), "
"padding(%d,%d), "
"k_size(%d) and stride(%d). Please check again!",
output_size, input_size, padding_1, padding_2, filter_size, stride));
return output_size;
}
void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
"X(Input) of Pooling should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
"Out(Output) of Pooling should not be null.");
PADDLE_ENFORCE_EQ(
ctx->HasInput("X"), true,
platform::errors::NotFound("Input(X) of Pool operator is not found."));
PADDLE_ENFORCE_EQ(
ctx->HasOutput("Out"), true,
platform::errors::NotFound("Output(Out) of Pool operator is not found."));
std::string pooling_type = ctx->Attrs().Get<std::string>("pooling_type");
std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
......@@ -65,28 +69,32 @@ void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
auto in_x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(
in_x_dims.size() == 4 || in_x_dims.size() == 5, true,
"ShapeError: the input of Op(pool) should be 4-D or 5-D Tensor. But "
"received: %u-D Tensor and it's shape is [%s].",
in_x_dims.size(), in_x_dims);
platform::errors::InvalidArgument(
"the input of Op(pool) should be 4-D or 5-D Tensor. But "
"received: %u-D Tensor and it's shape is [%s].",
in_x_dims.size(), in_x_dims));
PADDLE_ENFORCE_EQ(
in_x_dims.size() - ksize.size(), 2U,
"ShapeError: the dimension of input minus the size of "
"Attr(ksize) must be euqal to 2 in Op(pool). "
"But received: the dimension of input minus the size "
"of Attr(ksize) is %d, the "
"input's dimension is %d, the shape of input "
"is [%s], the Attr(ksize)'s size is %d, the Attr(ksize) is [%s].",
in_x_dims.size() - ksize.size(), in_x_dims.size(), in_x_dims,
ksize.size(), framework::make_ddim(ksize));
PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
"ShapeError: the size of Attr(ksize) and Attr(strides) in "
"Op(pool) must be equal. "
"But received: Attr(ksize)'s size is %d, Attr(strides)'s "
"size is %d, Attr(ksize) is [%s], Attr(strides)is [%s].",
ksize.size(), strides.size(), framework::make_ddim(ksize),
framework::make_ddim(strides));
platform::errors::InvalidArgument(
"the dimension of input minus the size of "
"Attr(ksize) must be euqal to 2 in Op(pool). "
"But received: the dimension of input minus the size "
"of Attr(ksize) is %d, the "
"input's dimension is %d, the shape of input "
"is [%s], the Attr(ksize)'s size is %d, the Attr(ksize) is [%s].",
in_x_dims.size() - ksize.size(), in_x_dims.size(), in_x_dims,
ksize.size(), framework::make_ddim(ksize)));
PADDLE_ENFORCE_EQ(
ksize.size(), strides.size(),
platform::errors::InvalidArgument(
"the size of Attr(ksize) and Attr(strides) in "
"Op(pool) must be equal. "
"But received: Attr(ksize)'s size is %d, Attr(strides)'s "
"size is %d, Attr(ksize) is [%s], Attr(strides)is [%s].",
ksize.size(), strides.size(), framework::make_ddim(ksize),
framework::make_ddim(strides)));
// MKL-DNN Kernels are using NCHW order of dims description
// so we ignore data_format consideration for MKL-DNN kernel
......@@ -182,9 +190,12 @@ framework::OpKernelType PoolOp::GetKernelTypeForVar(
}
void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) must not be null.");
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
platform::errors::NotFound(
"Input(X) of Pool Gradoperator is not found."));
PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true,
"Input(X@GRAD) should not be null.");
platform::errors::NotFound(
"Input(X@GRAD) of Pool Gradoperator is not found."));
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
......@@ -210,7 +221,8 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
if (input_data_type == framework::proto::VarType::FP16) {
PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
"float16 can only be used when CUDNN is used");
platform::errors::InvalidArgument(
"Float16 can only be used when CUDNN is used"));
}
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
library_);
......
......@@ -81,9 +81,11 @@ inline void UpdatePadding(std::vector<T>* paddings, const bool global_pooling,
paddings->insert(paddings->begin() + 2 * i + 1, copy_pad);
}
} else {
PADDLE_ENFORCE_EQ(
data_dims.size() * 2, paddings->size(),
"Paddings size should be the same or twice as the pooling size.");
PADDLE_ENFORCE_EQ(data_dims.size() * 2, paddings->size(),
platform::errors::InvalidArgument(
"Paddings size %d should be the same or twice as the "
"pooling size %d.",
paddings->size(), data_dims.size() * 2));
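// Note (illustrative, based on the check above): for 2-D pooling data_dims
// holds (H, W), so a valid explicit paddings attribute has either 2 entries
// (one per spatial dimension, duplicated to both sides) or 4 entries
// (both sides of each dimension given explicitly).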
}
// when padding_algorithm is "VALID" or "SAME"
......@@ -200,7 +202,10 @@ class PoolKernel : public framework::OpKernel<T> {
pool_process, exclusive, adaptive, out);
}
} break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
default: {
PADDLE_THROW(platform::errors::InvalidArgument(
"Pool op only supports 2D and 3D input."));
}
}
}
};
......@@ -287,7 +292,10 @@ class PoolGradKernel : public framework::OpKernel<T> {
adaptive, in_x_grad);
}
} break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
default: {
PADDLE_THROW(platform::errors::InvalidArgument(
"Pool op only supports 2D and 3D input."));
}
}
}
}
......
......@@ -46,8 +46,11 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
bool adaptive = ctx->Attrs().Get<bool>("adaptive");
PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
"Pooling intput should be 4-D or 5-D tensor.");
PADDLE_ENFORCE(
in_x_dims.size() == 4 || in_x_dims.size() == 5,
platform::errors::InvalidArgument("Pooling intput should be 4-D or 5-D "
"tensor but received %dD-Tensor",
in_x_dims.size()));
if (ctx->Attrs().Get<bool>("global_pooling")) {
ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
......@@ -57,16 +60,21 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
}
}
PADDLE_ENFORCE_EQ(in_x_dims.size() - ksize.size(), 2U,
platform::errors::InvalidArgument(
"Input size and pooling size should be consistent."));
PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
platform::errors::InvalidArgument(
"Strides size and pooling size should be the same."));
PADDLE_ENFORCE_EQ(
in_x_dims.size() - ksize.size(), 2U,
platform::errors::InvalidArgument(
"The input size %d minus the kernel size %d should equal to 2.",
in_x_dims.size(), ksize.size()));
PADDLE_ENFORCE_EQ(
ksize.size(), strides.size(),
platform::errors::InvalidArgument(
"Strides size %d and pooling size %d should be the same.",
strides.size(), ksize.size()));
PADDLE_ENFORCE_EQ(
ksize.size(), paddings.size(),
platform::errors::InvalidArgument(
"Paddings size and pooling size should be the same."));
"Paddings size %d and pooling size %d should be the same.",
paddings.size(), ksize.size()));
std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
if (adaptive) {
......
......@@ -61,7 +61,10 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T1> {
pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out,
mask);
} break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
default: {
PADDLE_THROW(platform::errors::InvalidArgument(
"Pool op only supports 2D and 3D input."));
}
}
}
};
......@@ -106,7 +109,10 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T1> {
pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides,
paddings, adaptive, in_x_grad);
} break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
default: {
PADDLE_THROW(platform::errors::InvalidArgument(
"Pool op only supports 2D and 3D input."));
}
}
}
}
......
......@@ -176,22 +176,31 @@ class GPUPSROIPoolOpKernel : public framework::OpKernel<T> {
int height = in_dims[2];
int width = in_dims[3];
PADDLE_ENFORCE_EQ(input_channels,
output_channels * pooled_height * pooled_width,
"the channels of input X should equal the product of "
"output_channels x pooled_height x pooled_width");
PADDLE_ENFORCE_EQ(
input_channels, output_channels * pooled_height * pooled_width,
platform::errors::InvalidArgument(
"The channels %d of input X should equal the product of "
"output_channels %d x pooled_height %d x pooled_width %d.",
input_channels, output_channels, pooled_height, pooled_width));
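// Illustrative check (hypothetical numbers, not from this kernel): with
// output_channels = 10 and a 7 x 7 pooled output, input X must carry
// 10 * 7 * 7 = 490 channels for PSROIPool to be well-formed.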
int rois_num = rois->dims()[0];
if (rois_num == 0) return;
auto rois_lod = rois->lod().back();
int rois_batch_size = rois_lod.size() - 1;
PADDLE_ENFORCE_EQ(
rois_batch_size, batch_size,
"The rois_batch_size and input(X) batch_size must be the same.");
PADDLE_ENFORCE_EQ(rois_batch_size, batch_size,
platform::errors::InvalidArgument(
"The batch size of input(ROIs) and input(X) must be "
"the same but received batch size of input(ROIs) and "
"input(X) is %d and %d respectively.",
rois_batch_size, batch_size));
int rois_num_with_lod = rois_lod[rois_batch_size];
PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
"The rois_num from input and lod must be the same.");
platform::errors::InvalidArgument(
"The number of rois from input(ROIs) and its LOD "
"must be the same. Received rois %d of input(ROIs) "
"but the number of rois %d from its LOD is %d",
rois_num, rois_num_with_lod));
// set rois batch id
framework::Tensor rois_batch_id_list;
......
......@@ -160,9 +160,14 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
if (ctx.HasInput("RoisNum")) {
auto* rois_num_t = ctx.Input<Tensor>("RoisNum");
int rois_batch_size = rois_num_t->numel();
PADDLE_ENFORCE_EQ(
rois_batch_size, batch_size,
"The rois_batch_size and imgs batch_size must be the same.");
platform::errors::InvalidArgument(
"The batch size of input(ROIs) and input(X) must be the same but "
"received batch size of input(ROIs) and input(X) is %d and %d "
"respectively.",
rois_batch_size, batch_size));
std::vector<int> rois_num_list(rois_batch_size);
memory::Copy(cplace, rois_num_list.data(), gplace,
rois_num_t->data<int>(), sizeof(int) * rois_batch_size, 0);
......@@ -178,10 +183,19 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
int rois_batch_size = rois_lod.size() - 1;
PADDLE_ENFORCE_EQ(
rois_batch_size, batch_size,
"The rois_batch_size and imgs batch_size must be the same.");
platform::errors::InvalidArgument(
"The batch size of input(ROIs) and input(X) must be the same but "
"received batch size of input(ROIs) and input(X) is %d and %d "
"respectively.",
rois_batch_size, batch_size));
int rois_num_with_lod = rois_lod[rois_batch_size];
PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
"The rois_num from input and lod must be the same.");
platform::errors::InvalidArgument(
"The number of rois from input(ROIs) and its LOD "
"must be the same. Received rois %d of input(ROIs) "
"but the number of rois %d from its LOD is %d",
rois_num, rois_num_with_lod));
for (int n = 0; n < rois_batch_size; ++n) {
for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
......
......@@ -103,13 +103,13 @@ class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker {
"Target sequence length for Label when Label is a 2-D tensor.")
.AsDispensable();
AddOutput("WarpCTCGrad",
"(Tensor, default: Tensor<float>), a temporary "
"(Tensor), a temporary "
"output Tensor to store the gradients of warp-ctc, which is "
"computed with loss together in one call. It is a 3-D Tensor of "
"the shape [max_sequence_length, batch_size, num_classes + 1].")
.AsIntermediate();
AddOutput("Loss",
"(Tensor, default: Tensor<float>), the Connectionist "
"(Tensor), the Connectionist "
"Temporal Classification (CTC) loss, which is a 2-D Tensor of "
"the shape [batch_size, 1]");
AddAttr<int>("blank",
......@@ -197,7 +197,9 @@ REGISTER_OPERATOR(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker,
REGISTER_OPERATOR(warpctc_grad, ops::WarpCTCGradOp,
ops::WarpCTCGradOpNoNeedBufferVarInferer);
REGISTER_OP_CPU_KERNEL(
warpctc, ops::WarpCTCKernel<paddle::platform::CPUDeviceContext, float>);
warpctc, ops::WarpCTCKernel<paddle::platform::CPUDeviceContext, float>,
ops::WarpCTCKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
warpctc_grad,
ops::WarpCTCGradKernel<paddle::platform::CPUDeviceContext, float>);
ops::WarpCTCGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::WarpCTCGradKernel<paddle::platform::CPUDeviceContext, double>);
......@@ -16,7 +16,9 @@ limitations under the License. */
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
warpctc, ops::WarpCTCKernel<paddle::platform::CUDADeviceContext, float>);
warpctc, ops::WarpCTCKernel<paddle::platform::CUDADeviceContext, float>,
ops::WarpCTCKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
warpctc_grad,
ops::WarpCTCGradKernel<paddle::platform::CUDADeviceContext, float>);
ops::WarpCTCGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::WarpCTCGradKernel<paddle::platform::CUDADeviceContext, double>);
......@@ -27,7 +27,52 @@ namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
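// ComputeCtcLossFunctor dispatches the warp-ctc call by element type: the
// float specialization forwards to compute_ctc_loss, the double
// specialization forwards to compute_ctc_loss_double, and the unspecialized
// template reports CTC_STATUS_EXECUTION_FAILED for any other type.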
template <typename DeviceContext, typename T>
class ComputeCtcLossFunctor {
public:
ctcStatus_t operator()(const T* const activations, T* gradients,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths, int alphabet_size,
int minibatch, T* costs, void* workspace,
ctcOptions options) {
return CTC_STATUS_EXECUTION_FAILED;
}
};
template <typename DeviceContext>
class ComputeCtcLossFunctor<DeviceContext, float> {
public:
ctcStatus_t operator()(const float* const activations, float* gradients,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths, int alphabet_size,
int minibatch, float* costs, void* workspace,
ctcOptions options) {
return platform::dynload::compute_ctc_loss(
activations, gradients, flat_labels, label_lengths, input_lengths,
static_cast<int>(alphabet_size), static_cast<int>(minibatch), costs,
workspace, options);
}
};
template <typename DeviceContext>
class ComputeCtcLossFunctor<DeviceContext, double> {
public:
ctcStatus_t operator()(const double* const activations, double* gradients,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths, int alphabet_size,
int minibatch, double* costs, void* workspace,
ctcOptions options) {
return platform::dynload::compute_ctc_loss_double(
activations, gradients, flat_labels, label_lengths, input_lengths,
static_cast<int>(alphabet_size), static_cast<int>(minibatch), costs,
workspace, options);
}
};
template <typename DeviceContext, typename T>
class WarpCTCFunctor {
public:
/*
......@@ -51,21 +96,29 @@ class WarpCTCFunctor {
* \param blank blank label used in ctc loss function.
* \param cpu_loss cost of each sequence in CPU memory.
*/
void operator()(const framework::ExecutionContext& ctx, const float* input,
float* gradient, const int* cpu_labels,
void operator()(const framework::ExecutionContext& ctx, const T* input,
T* gradient, const int* cpu_labels,
const int* cpu_label_lengths, const int* cpu_input_lengths,
const size_t sequence_width, const size_t num_sequences,
const size_t blank, float* cpu_loss) {
const size_t blank, T* cpu_loss) {
// Init warp-ctc options
init(ctx, blank);
// Compute the required workspace size.
// There is no memory allocated operations within warp-ctc.
size_t workspace_bytes = 0;
ctcStatus_t status = platform::dynload::get_workspace_size(
cpu_label_lengths, cpu_input_lengths, static_cast<int>(sequence_width),
static_cast<int>(num_sequences), options_, &workspace_bytes);
ctcStatus_t status = CTC_STATUS_UNKNOWN_ERROR;
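// Select the single- or double-precision workspace query by the element
// size of T: a 4-byte T (float) uses get_workspace_size, otherwise
// get_workspace_size_double is used.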
if (sizeof(T) == 4) {
status = platform::dynload::get_workspace_size(
cpu_label_lengths, cpu_input_lengths,
static_cast<int>(sequence_width), static_cast<int>(num_sequences),
options_, &workspace_bytes);
} else {
status = platform::dynload::get_workspace_size_double(
cpu_label_lengths, cpu_input_lengths,
static_cast<int>(sequence_width), static_cast<int>(num_sequences),
options_, &workspace_bytes);
}
PADDLE_ENFORCE_EQ(
CTC_STATUS_SUCCESS, status,
platform::errors::PreconditionNotMet(
......@@ -79,17 +132,17 @@ class WarpCTCFunctor {
workspace_bytes));
auto& dev_ctx = ctx.template device_context<DeviceContext>();
size_t workspace_elements = workspace_bytes / sizeof(float) + 1UL;
Tensor workspace = ctx.AllocateTmpTensor<float, DeviceContext>(
size_t workspace_elements = workspace_bytes / sizeof(T) + 1UL;
Tensor workspace = ctx.AllocateTmpTensor<T, DeviceContext>(
framework::make_ddim({static_cast<int64_t>(workspace_elements)}),
dev_ctx);
float* workspace_data = workspace.data<float>();
math::SetConstant<DeviceContext, float>()(
T* workspace_data = workspace.data<T>();
math::SetConstant<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), &workspace,
static_cast<float>(0));
static_cast<T>(0));
// compute loss and gradient
status = platform::dynload::compute_ctc_loss(
status = ComputeCtcLossFunctor<DeviceContext, T>()(
input, gradient, cpu_labels, cpu_label_lengths, cpu_input_lengths,
static_cast<int>(sequence_width), static_cast<int>(num_sequences),
cpu_loss, workspace_data, options_);
......@@ -112,7 +165,8 @@ class WarpCTCFunctor {
ctx.device_context())
.stream();
#else
PADDLE_THROW("[warpctc init] GPU is not enabled.");
PADDLE_THROW(platform::errors::PreconditionNotMet(
"[warpctc init] GPU is not enabled."));
#endif
} else {
options_.loc = CTC_CPU;
......@@ -292,7 +346,7 @@ class WarpCTCKernel : public framework::OpKernel<T> {
const size_t blank = static_cast<size_t>(ctx.Attr<int>("blank"));
WarpCTCFunctor<DeviceContext>()(
WarpCTCFunctor<DeviceContext, T>()(
ctx, warpctc_logits_data, warpctc_grad_data, warpctc_label_data,
warpctc_label_lengths.data(), warpctc_logits_lengths.data(),
sequence_width, num_sequences, blank, warpctc_loss_data);
......
......@@ -53,7 +53,9 @@ extern void* warpctc_dso_handle;
__macro(get_warpctc_version); \
__macro(ctcGetStatusString); \
__macro(compute_ctc_loss); \
__macro(get_workspace_size)
__macro(compute_ctc_loss_double); \
__macro(get_workspace_size); \
__macro(get_workspace_size_double)
WARPCTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP);
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/xpu_info.h"
TEST(InitDevices, CPU) {
using paddle::framework::InitDevices;
......
......@@ -15,9 +15,36 @@
#pragma once
#ifdef PADDLE_WITH_XPU
#include <string>
#include <unordered_map>
#include "paddle/fluid/platform/errors.h"
#include "xpu/api.h"
#include "xpu/runtime.h"
#include "xpu/runtime_ex.h"
namespace xpu = baidu::xpu::api;
class XPUActHelper {
public:
// Convert string to activation type in xpu
static xpu::Activation_t ConvertToXpuActType(
const std::string& act_type_str) {
static std::unordered_map<std::string, xpu::Activation_t> str2act = {
{"linear", xpu::Activation_t::LINEAR},
{"relu", xpu::Activation_t::RELU},
{"sigmoid", xpu::Activation_t::SIGMOID},
{"tanh", xpu::Activation_t::TANH},
{"gelu", xpu::Activation_t::GELU},
{"leaky_relu", xpu::Activation_t::LEAKY_RELU},
{"sqrt", xpu::Activation_t::SQRT},
{"square", xpu::Activation_t::SQUARE}};
auto res = str2act.find(act_type_str);
PADDLE_ENFORCE_NE(res, str2act.end(),
paddle::platform::errors::InvalidArgument(
"Invalid activation type(%s) in XPU", act_type_str));
return res->second;
}
};
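// Hypothetical call site (for illustration only):
//   auto act = XPUActHelper::ConvertToXpuActType("relu");
// An unknown string such as "swish" would trigger the InvalidArgument
// error above.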
#endif
......@@ -48,6 +48,7 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
{"collect_fpn_proposals",
{"MultiLevelRois", "MultiLevelScores", "MultiLevelRoIsNum"}},
{"distribute_fpn_proposals", {"FpnRois", "RoisNum"}},
{"warpctc", {"Logits", "Label", "LogitsLength", "LabelLength"}},
};
// NOTE(zhiqiu): Like op_ins_map.
......
......@@ -230,7 +230,6 @@ from .framework import CPUPlace #DEFINE_ALIAS
from .framework import CUDAPlace #DEFINE_ALIAS
from .framework import CUDAPinnedPlace #DEFINE_ALIAS
from .framework import to_variable #DEFINE_ALIAS
from .framework import grad #DEFINE_ALIAS
from .framework import no_grad #DEFINE_ALIAS
from .framework import save #DEFINE_ALIAS
......@@ -258,6 +257,8 @@ from .tensor.stat import numel #DEFINE_ALIAS
from .device import get_cudnn_version
from .device import set_device
from .device import get_device
from .device import is_compiled_with_xpu
from .device import XPUPlace
# from .tensor.tensor import Tensor #DEFINE_ALIAS
# from .tensor.tensor import LoDTensor #DEFINE_ALIAS
# from .tensor.tensor import LoDTensorArray #DEFINE_ALIAS
......
......@@ -22,7 +22,9 @@ from paddle.fluid.dygraph.parallel import ParallelEnv
__all__ = [
'get_cudnn_version',
'set_device',
'get_device'
'get_device',
'XPUPlace',
'is_compiled_with_xpu'
# 'cpu_places',
# 'CPUPlace',
# 'cuda_pinned_places',
......@@ -35,6 +37,37 @@ __all__ = [
_cudnn_version = None
def is_compiled_with_xpu():
"""
Whether paddle was built with WITH_XPU=ON to support Baidu Kunlun
Returns (bool): whether paddle was built with WITH_XPU=ON
Examples:
.. code-block:: python
import paddle
support_xpu = paddle.device.is_compiled_with_xpu()
"""
return core.is_compiled_with_xpu()
def XPUPlace(dev_id):
"""
Return a Baidu Kunlun Place
Parameters:
dev_id(int): Baidu Kunlun device id
Examples:
.. code-block:: python
import paddle
place = paddle.device.XPUPlace(0)
"""
return core.XPUPlace(dev_id)
def get_cudnn_version():
"""
This function returns the version of cudnn. The return value is an int which represents the
......
......@@ -30,6 +30,7 @@ __all__ = [
]
fleet = Fleet()
_final_strategy = fleet._final_strategy
init = fleet.init
is_first_worker = fleet.is_first_worker
worker_index = fleet.worker_index
......
......@@ -1244,8 +1244,7 @@ class DistributedStrategy(object):
if getattr(self.strategy, f.name):
draws += border + "\n"
draws += h1_format.format(
"{} = True, please check {}_configs".format(
f.name, f.name))
"{}=True <-> {}_configs".format(f.name, f.name))
draws += line + "\n"
my_configs = getattr(self.strategy,
f.name + "_configs")
......
......@@ -119,6 +119,8 @@ class Fleet(object):
self.strategy_compiler = None
self._is_collective = False
self._runtime_handle = None
self._util = None
self._context = {}
def init(self, role_maker=None, is_collective=False):
"""
......@@ -233,7 +235,7 @@ class Fleet(object):
Returns:
int: worker numbers
Examples:
.. code-block:: python
......@@ -569,8 +571,9 @@ class Fleet(object):
if strategy == None:
strategy = DistributedStrategy()
self.user_defined_strategy = strategy
self.valid_strategy = None
self._user_defined_strategy = copy.deepcopy(strategy)
self._context = {}
return self
@dygraph_only
......@@ -909,6 +912,15 @@ class Fleet(object):
# imitate target optimizer retrieval
return self.user_defined_optimizer.clear_grad()
def _final_strategy(self):
if "valid_strategy" not in self._context:
print(
"WARNING: You may need to call minimize function before this function is called"
)
return {}
else:
return self._context["valid_strategy"]
def minimize(self,
loss,
startup_program=None,
......@@ -958,12 +970,15 @@ class Fleet(object):
# for more examples, please reference https://github.com/PaddlePaddle/FleetX
"""
context = {}
context["user_defined_strategy"] = copy.deepcopy(
self._user_defined_strategy)
if paddle.fluid.framework.in_dygraph_mode():
# imitate target optimizer retrieval
target_opt = self.user_defined_optimizer
self._context = context
return target_opt.minimize(loss)
context = {}
# cache original feed forward program
self.origin_main_program = loss.block.program
context["origin_main_program"] = self.origin_main_program
......@@ -984,17 +999,19 @@ class Fleet(object):
MetaOptimizerFactory()._get_valid_meta_optimizers(
self.user_defined_optimizer)
context["user_defined_strategy"] = copy.copy(self.user_defined_strategy)
context["user_defined_strategy"] = copy.deepcopy(
self._user_defined_strategy)
copy_user_defined_strategy = copy.deepcopy(self._user_defined_strategy)
# trigger the auto-parallel in very strict condition
# strategy = DistributedStrategy()
# strategy.auto = True
# optimizer = paddle.optimizer.SGD(learning_rate=0.1)
# optimizer = fleet.distributed_optimizer(optimizer, strategy)
if self.user_defined_strategy._is_strict_auto():
if copy_user_defined_strategy._is_strict_auto():
# turn on all the strategy for each optimizer
for opt in distributed_optimizer_list:
opt._enable_strategy(self.user_defined_strategy, context)
opt._enable_strategy(copy_user_defined_strategy, context)
valid_optimizer_list = []
valid_graph_optimizer_list = []
......@@ -1003,7 +1020,7 @@ class Fleet(object):
for opt in distributed_optimizer_list:
opt._set_basic_info(loss, self._role_maker,
self.user_defined_optimizer,
self.user_defined_strategy)
copy_user_defined_strategy)
if opt._can_apply() and not opt._is_graph_out():
valid_optimizer_list.append(opt)
elif opt._can_apply() and opt._is_graph_out():
......@@ -1014,13 +1031,15 @@ class Fleet(object):
meta_optimizer, graph_optimizer = \
self.strategy_compiler.generate_optimizer(
loss, self._role_maker, self.user_defined_optimizer,
self.user_defined_strategy, valid_optimizer_list,
copy_user_defined_strategy, valid_optimizer_list,
valid_graph_optimizer_list)
valid_strategy = self.strategy_compiler._get_valid_strategy(
self.user_defined_strategy, can_not_apply_optimizer_list)
copy_user_defined_strategy, can_not_apply_optimizer_list)
context["valid_strategy"] = copy.deepcopy(valid_strategy)
context["valid_strategy"] = valid_strategy
self._context = context
self.valid_strategy = valid_strategy
self.valid_strategy._enable_env()
......
......@@ -495,7 +495,7 @@ class RoleMakerBase(object):
Returns:
string: all heter_trainers'endpoints
"""
assert self._heter_trainer_endpoints != []
assert self._heter_trainer_endpoints != [], "Heter Worker Endpoints Not initialized"
return self._heter_trainer_endpoints
def _get_heter_worker_endpoint(self):
......@@ -505,10 +505,10 @@ class RoleMakerBase(object):
e.g: if we have 4 cpu-trainer(default), 2 gpu-trainer(heter)
then No.0 and No.2 cpu-trainer will work with No.0 gpu-trainer
and No.1 and No.3 cpu-trainer will work with No.1 gpu-trainerr
and No.1 and No.3 cpu-trainer will work with No.1 gpu-trainer
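i.e. the pairing is cpu-trainer id % heter worker num, so with 4
cpu-trainers and 2 heter workers the ids 0, 1, 2, 3 map to heter
workers 0, 1, 0, 1 respectively.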
"""
assert self._heter_trainer_endpoints != []
return self._heter_trainer_endpoints[(self._current_id + 1) %
assert self._heter_trainer_endpoints != [], "Heter Worker Endpoints Not initialized"
return self._heter_trainer_endpoints[(self._current_id) %
self._heter_worker_num()]
def _get_heter_worker_device(self):
......
......@@ -23,6 +23,7 @@ from paddle.fluid.executor import Executor
from paddle.fluid.parallel_executor import ParallelExecutor
from .runtime_base import RuntimeBase
from ..base.private_helper_function import wait_server_ready
class ParameterServerRuntime(RuntimeBase):
......@@ -94,8 +95,8 @@ class ParameterServerRuntime(RuntimeBase):
return False
if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
var.desc.type() == core.VarDesc.VarType.READER:
var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
var.desc.type() == core.VarDesc.VarType.READER:
return False
return var.persistable
......@@ -161,6 +162,17 @@ class ParameterServerRuntime(RuntimeBase):
trainer_config = self.async_strategy.get_trainer_runtime_config()
dist_strategy = self.context["valid_strategy"]
launch_barrier = dist_strategy.a_sync_configs["launch_barrier"]
if launch_barrier:
# for trainer wait server ready
wait_server_ready(self.role_maker._get_pserver_endpoints())
# for ps-heter mode, wait heter worker ready
if self.role_maker._is_heter_parameter_server_mode and self.role_maker._is_worker(
):
wait_server_ready(self.role_maker._get_heter_worker_endpoints())
lrs = _has_global_step(_get_lr_ops(self.origin_main_program))
if lrs:
......@@ -312,7 +324,7 @@ class ParameterServerRuntime(RuntimeBase):
opts = _get_optimize_ops(self.origin_main_program)
for op in opts:
if "Param" in op.input_names and \
"LearningRate" in op.input_names and op.input("Param")[0] == param_name:
"LearningRate" in op.input_names and op.input("Param")[0] == param_name:
return op
def _save_dense_params(self, executor, dirname, context, main_program):
......
......@@ -1291,17 +1291,17 @@ def append_backward(loss,
It will be automatically invoked by the optimizer's `minimize` function.
Parameters:
loss( :ref:`api_guide_Variable_en` ): The loss variable of the network.
parameter_list(list[Variable|str], optional): List of Parameters or Parameter.names
loss(Tensor): The loss Tensor of the network.
parameter_list(list[Tensor|str], optional): List of Parameters or Parameter.names
that need to be updated by optimizers.
If it is None, all parameters
will be updated.
Default: None.
no_grad_set(set[Variable|str], optional): Set of Variables or Variable.names in the :ref:`api_guide_Block_en` 0 whose gradients
should be ignored. All variables with
no_grad_set(set[Tensor|str], optional): Set of Tensors or Tensor.names in the :ref:`api_guide_Block_en` 0 whose gradients
should be ignored. All Tensors with
`stop_gradient=True` from all blocks will
be automatically added into this set.
If this parameter is not None, the Variables or Variable.names in this set will be added to the default set.
If this parameter is not None, the Tensors or Tensor.names in this set will be added to the default set.
Default: None.
callbacks(list[callable object], optional): List of callback functions.
The callbacks are used for
......@@ -1312,70 +1312,73 @@ def append_backward(loss,
new gradient operator is added
into the program. The callable
object must have two input
parameters: 'block' and 'context'.
The 'block' is the :ref:`api_guide_Block_en` which
parameters: ``block`` and ``context`` .
The ``block`` is the :ref:`api_guide_Block_en` which
the new gradient operator will
be added to. The 'context' is a
be added to. The ``context`` is a
map, whose keys are gradient
variable names and values are
corresponding original :ref:`api_guide_Variable_en` .
In addition to this, the 'context'
Tensor names and values are
corresponding original :ref:`api_guide_tensor_en` .
In addition to this, the ``context``
has another special key-value pair:
the key is string '__current_op_desc__'
the key is string ``__current_op_desc__``
and the value is the op_desc of the
gradient operator who has just
triggered the callable object.
Default: None.
Returns:
list of tuple ( :ref:`api_guide_Variable_en` , :ref:`api_guide_Variable_en` ): Pairs of parameter and its corresponding gradients.
The key is the parameter and the value is gradient variable.
list of tuple ( :ref:`api_guide_tensor_en` , :ref:`api_guide_tensor_en` ): Pairs of parameter and its corresponding gradients.
The key is the parameter and the value is gradient Tensor.
Raises:
AssertionError: If `loss` is not an instance of Variable.
AssertionError: If ``loss`` is not an instance of Tensor.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle
import paddle.nn.functional as F
x = fluid.data(name='x', shape=[None, 13], dtype='int64')
y = fluid.data(name='y', shape=[None, 1], dtype='float32')
x_emb = fluid.embedding(x, size=[100, 256])
y_predict = fluid.layers.fc(input=x_emb, size=1, act=None, name='my_fc')
loss = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_loss = fluid.layers.mean(loss)
paddle.enable_static()
x = paddle.static.data(name='x', shape=[None, 13], dtype='int64')
y = paddle.static.data(name='y', shape=[None, 1], dtype='float32')
x_emb = paddle.static.nn.embedding(x, size=[100, 256])
y_predict = paddle.static.nn.fc(input=x_emb, size=1, act=None, name='my_fc')
loss = F.square_error_cost(input=y_predict, label=y)
avg_loss = paddle.mean(loss)
# Get all weights in main_program, not include bias.
all_weights = [param for param in fluid.default_main_program().block(0).all_parameters() if 'w_' in param.name]
all_weights = [param for param in paddle.static.default_main_program().block(0).all_parameters() if 'w_' in param.name]
all_weights_name = [w.name for w in all_weights]
# return all param_grads needed to be updated if parameter_list set default None.
p_g_list1 = fluid.backward.append_backward(loss=avg_loss)
p_g_list1 = paddle.static.append_backward(loss=avg_loss)
# output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD), (my_fc.b_0, my_fc.b_0@GRAD)]
# return the param_grads corresponding to parameter_list that can be list of param (Variable).
p_g_list2 = fluid.backward.append_backward(loss=avg_loss, parameter_list=all_weights)
# return the param_grads corresponding to parameter_list that can be list of param (Tensor).
p_g_list2 = paddle.static.append_backward(loss=avg_loss, parameter_list=all_weights)
# output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)]
# parameter_list can be list of param.name (str).
p_g_list3 = fluid.backward.append_backward(loss=avg_loss, parameter_list=all_weights_name)
p_g_list3 = paddle.static.append_backward(loss=avg_loss, parameter_list=all_weights_name)
# output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)]
# no_grad_set can be set of Variables that means grad will be cut off from these Variables.
p_g_list4 = fluid.backward.append_backward(loss=avg_loss, no_grad_set=set([x_emb]))
# no_grad_set can be set of Tensors that means grad will be cut off from these Tensors.
p_g_list4 = paddle.static.append_backward(loss=avg_loss, no_grad_set=set([x_emb]))
# output: [(my_fc.w_0, my_fc.w_0@GRAD), (my_fc.b_0, my_fc.b_0@GRAD)]
# no_grad_set can be set of Variable.name when the Variable is created inside layers and can't be specified explicitly.
p_g_list5 = fluid.backward.append_backward(loss=avg_loss, no_grad_set=set(['my_fc.b_0']))
# no_grad_set can be set of Tensor.name when the Tensor is created inside layers and can't be specified explicitly.
p_g_list5 = paddle.static.append_backward(loss=avg_loss, no_grad_set=set(['my_fc.b_0']))
# output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)]
# return [] because all param_grads are filtered by no_grad_set.
p_g_list6 = fluid.backward.append_backward(loss=avg_loss, parameter_list=all_weights, no_grad_set=set(all_weights))
p_g_list6 = paddle.static.append_backward(loss=avg_loss, parameter_list=all_weights, no_grad_set=set(all_weights))
"""
check_type(loss, 'loss', framework.Variable,
'fluid.backward.append_backward')
'paddle.static.append_backward')
if loss.op is None:
# the loss is from a cloned program. Find loss op manually.
......@@ -1387,7 +1390,7 @@ def append_backward(loss,
if callbacks is not None:
check_type(callbacks, 'callbacks', list,
'fluid.backward.append_backward')
'paddle.static.append_backward')
program = loss.block.program
root_block = program.block(0)
......@@ -1727,21 +1730,21 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
Backpropagate the gradients of targets to inputs.
Args:
targets(Variable|list[Variable]): The target variables
inputs(Variable|list[Variable]): The input variables
target_gradients (Variable|list[Variable], optional): The gradient variables
targets(Tensor|list[Tensor]): The target Tensors
inputs(Tensor|list[Tensor]): The input Tensors
target_gradients (Tensor|list[Tensor], optional): The gradient Tensors
of targets, which have the same shape as targets. If None, ones will
be created for them.
no_grad_set(set[Variable|str], optional): Set of Variables or Variable.names in the :ref:`api_guide_Block_en` 0 whose gradients
should be ignored. All variables with
no_grad_set(set[Tensor|str], optional): Set of Tensors or Tensor.names in the :ref:`api_guide_Block_en` 0 whose gradients
should be ignored. All Tensors with
`stop_gradient=True` from all blocks will
be automatically added into this set.
If this parameter is not None, the Variables or Variable.names in this set will be added to the default set.
If this parameter is not None, the Tensors or Tensor.names in this set will be added to the default set.
Default: None.
Return:
(list[Variable]): A list of gradients for inputs
If an input does not affect targets, the corresponding gradient variable
(list[Tensor]): A list of gradients for inputs
If an input does not affect targets, the corresponding gradient Tensor
will be None
"""
targets = _as_list(targets)
......@@ -1865,41 +1868,42 @@ def gradients(targets, inputs, target_gradients=None, no_grad_set=None):
Backpropagate the gradients of targets to inputs.
Args:
targets (Variable|list[Variable]): The target variables.
inputs (Variable|list[Variable]): The input variables.
target_gradients (Variable|list[Variable], optional): The gradient variables
targets (Tensor|list[Tensor]): The target Tensors.
inputs (Tensor|list[Tensor]): The input Tensors.
target_gradients (Tensor|list[Tensor], optional): The gradient Tensor
of targets, which have the same shape as targets. If None, ones will
be created for them.
no_grad_set (set[Variable|str], optional): Set of Variables or Variable.names in the :ref:`api_guide_Block_en` 0 whose gradients
should be ignored. All variables with `stop_gradient=True` from all blocks will
be automatically added into this set. If this parameter is not None, the Variables or Variable.names
no_grad_set (set[Tensor|str], optional): Set of Tensors or Tensor.names in the :ref:`api_guide_Block_en` 0 whose gradients
should be ignored. All Tensors with ``stop_gradient=True`` from all blocks will
be automatically added into this set. If this parameter is not None, the Tensors or Tensor.names
in this set will be added to the default set. Default: None.
Return:
(list[Variable]): A list of gradients for inputs
If an input does not affect targets, the corresponding gradient variable
(list[Tensor]): A list of gradients for inputs
If an input does not affect targets, the corresponding gradient Tensor
will be None.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle
import paddle.nn.functional as F
paddle.enable_static()
x = fluid.data(name='x', shape=[None,2,8,8], dtype='float32')
x = paddle.static.data(name='x', shape=[None, 2, 8, 8], dtype='float32')
x.stop_gradient=False
y = fluid.layers.conv2d(x, 4, 1, bias_attr=False)
y = fluid.layers.relu(y)
y = fluid.layers.conv2d(y, 4, 1, bias_attr=False)
y = fluid.layers.relu(y)
z = fluid.gradients([y], x)
print(z)
y = paddle.static.nn.conv2d(x, 4, 1, bias_attr=False)
y = F.relu(y)
z = paddle.static.gradients([y], x)
print(z) # [var x@GRAD : fluid.VarType.LOD_TENSOR.shape(-1L, 2L, 8L, 8L).astype(VarType.FP32)]
"""
check_type(targets, 'targets', (framework.Variable, list),
'fluid.backward.gradients')
'paddle.static.gradients')
check_type(inputs, 'inputs', (framework.Variable, list),
'fluid.backward.gradients')
'paddle.static.gradients')
check_type(target_gradients, 'target_gradients', (
framework.Variable, list, type(None)), 'fluid.backward.gradients')
framework.Variable, list, type(None)), 'paddle.static.gradients')
outs = calc_gradient(targets, inputs, target_gradients, no_grad_set)
return _as_list(outs)
......@@ -3230,14 +3230,11 @@ class Flatten(layers.Layer):
.. code-block:: python
import paddle
from paddle import to_variable
import numpy as np
paddle.disable_static()
inp_np = np.ones([5, 2, 3, 4]).astype('float32')
paddle.disable_static()
inp_np = to_variable(inp_np)
inp_np = paddle.to_tensor(inp_np)
flatten = paddle.nn.Flatten(start_axis=1, stop_axis=2)
flatten_res = flatten(inp_np)
......
......@@ -5396,13 +5396,13 @@ def program_guard(main_program, startup_program=None):
"""
:api_attr: Static Graph
Change the global main program and startup program with `"with"` statement.
Layer functions in the Python `"with"` block will append operators and
variables to the new main programs.
Change the global main program and startup program with ``with`` statement.
Layer functions in the Python ``with`` block will append operators and
Tensors to the new main programs.
Args:
main_program(Program): New main program inside `"with"` statement.
startup_program(Program, optional): New startup program inside `"with"`
main_program(Program): New main program inside ``with`` statement.
startup_program(Program, optional): New startup program inside ``with``
statement. :code:`None` means not changing startup program,
default_startup_program is still used.
Default: None.
......@@ -5410,13 +5410,14 @@ def program_guard(main_program, startup_program=None):
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle
main_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(main_program, startup_program):
data = fluid.data(name='image', shape=[None, 784, 784], dtype='float32')
hidden = fluid.layers.fc(input=data, size=10, act='relu')
paddle.enable_static()
main_program = paddle.static.Program()
startup_program = paddle.static.Program()
with paddle.static.program_guard(main_program, startup_program):
data = paddle.static.data(name='image', shape=[None, 784, 784], dtype='float32')
hidden = paddle.static.nn.fc(input=data, size=10, act='relu')
Notes: The temporary :code:`Program` can be used if the user does not need
to construct either the startup program or the main program.
......@@ -5424,20 +5425,22 @@ def program_guard(main_program, startup_program=None):
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle
main_program = fluid.Program()
# does not care about startup program. Just pass a temporary value.
with fluid.program_guard(main_program, fluid.Program()):
data = fluid.data(name='image', shape=[None, 784, 784], dtype='float32')
paddle.enable_static()
main_program = paddle.static.Program()
# does not care about startup program. Just pass a temporary value.
with paddle.static.program_guard(main_program, paddle.static.Program()):
data = paddle.static.data(name='image', shape=[None, 784, 784], dtype='float32')
"""
from .data_feeder import check_type
check_type(main_program, 'main_program', Program, 'fluid.program_guard')
check_type(main_program, 'main_program', Program,
'paddle.static.program_guard')
main_program = switch_main_program(main_program)
if startup_program is not None:
check_type(startup_program, 'startup_program', Program,
'fluid.program_guard')
'paddle.static.program_guard')
startup_program = switch_startup_program(startup_program)
try:
yield
......
......@@ -2488,9 +2488,6 @@ def _error_message(what, arg_name, op_name, right_value, error_value):
def case(pred_fn_pairs, default=None, name=None):
'''
:api_attr: Static Graph
:alias_main: paddle.nn.case
:alias: paddle.nn.case,paddle.nn.control_flow.case
:old_api: paddle.fluid.layers.case
This operator works like an if-elif-elif-else chain.
......@@ -2500,7 +2497,7 @@ def case(pred_fn_pairs, default=None, name=None):
name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`.
Returns:
Variable|list(Variable): Tensors returned by the callable from the first pair whose pred is True,
Tensor|list(Tensor): Tensors returned by the callable from the first pair whose pred is True,
or Tensors returned by ``default`` if no pred in ``pred_fn_pairs`` is True and ``default`` is not None,
or Tensors returned by the last callable in ``pred_fn_pairs`` if no pred in ``pred_fn_pairs`` is True and ``default`` is None.
......@@ -2508,45 +2505,47 @@ def case(pred_fn_pairs, default=None, name=None):
TypeError: If the type of ``pred_fn_pairs`` is not list or tuple.
TypeError: If the type of elements in ``pred_fn_pairs`` is not tuple.
TypeError: If the size of tuples in ``pred_fn_pairs`` is not 2.
TypeError: If the first element of 2-tuple in ``pred_fn_pairs`` is not Variable.
TypeError: If the first element of 2-tuple in ``pred_fn_pairs`` is not a Tensor.
TypeError: If the second element of 2-tuple in ``pred_fn_pairs`` is not callable.
TypeError: If ``default`` is not None but it is not callable.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import paddle
paddle.enable_static()
def fn_1():
return layers.fill_constant(shape=[1, 2], dtype='float32', value=1)
return paddle.fill_constant(shape=[1, 2], dtype='float32', value=1)
def fn_2():
return layers.fill_constant(shape=[2, 2], dtype='int32', value=2)
return paddle.fill_constant(shape=[2, 2], dtype='int32', value=2)
def fn_3():
return layers.fill_constant(shape=[3], dtype='int32', value=3)
return paddle.fill_constant(shape=[3], dtype='int32', value=3)
main_program = fluid.default_startup_program()
startup_program = fluid.default_main_program()
with fluid.program_guard(main_program, startup_program):
x = layers.fill_constant(shape=[1], dtype='float32', value=0.3)
y = layers.fill_constant(shape=[1], dtype='float32', value=0.1)
z = layers.fill_constant(shape=[1], dtype='float32', value=0.2)
main_program = paddle.static.default_startup_program()
startup_program = paddle.static.default_main_program()
with paddle.static.program_guard(main_program, startup_program):
x = paddle.fill_constant(shape=[1], dtype='float32', value=0.3)
y = paddle.fill_constant(shape=[1], dtype='float32', value=0.1)
z = paddle.fill_constant(shape=[1], dtype='float32', value=0.2)
pred_1 = layers.less_than(z, x) # true: 0.2 < 0.3
pred_2 = layers.less_than(x, y) # false: 0.3 < 0.1
pred_3 = layers.equal(x, y) # false: 0.3 == 0.1
pred_1 = paddle.less_than(z, x) # true: 0.2 < 0.3
pred_2 = paddle.less_than(x, y) # false: 0.3 < 0.1
pred_3 = paddle.equal(x, y) # false: 0.3 == 0.1
# Call fn_1 because pred_1 is True
out_1 = layers.case(
out_1 = paddle.static.nn.case(
pred_fn_pairs=[(pred_1, fn_1), (pred_2, fn_2)], default=fn_3)
# Argument default is None and no pred in pred_fn_pairs is True. fn_3 will be called.
# because fn_3 is the last callable in pred_fn_pairs.
out_2 = layers.case(pred_fn_pairs=[(pred_2, fn_2), (pred_3, fn_3)])
out_2 = paddle.static.nn.case(pred_fn_pairs=[(pred_2, fn_2), (pred_3, fn_3)])
exe = fluid.Executor(fluid.CPUPlace())
exe = paddle.static.Executor(paddle.CPUPlace())
res_1, res_2 = exe.run(main_program, fetch_list=[out_1, out_2])
print(res_1) # [[1. 1.]]
print(res_2) # [3 3 3]
......@@ -3610,18 +3609,18 @@ def switch_case(branch_index, branch_fns, default=None, name=None):
This operator is like a C++ switch/case statement.
Args:
branch_index(Variable): A Tensor with shape [1] to specify which branch to execute. The data type is ``int32``, ``int64`` or ``uint8``.
branch_index(Tensor): A Tensor with shape [1] to specify which branch to execute. The data type is ``int32``, ``int64`` or ``uint8``.
branch_fns(dict|list|tuple): If it's a list or tuple, the elements in it could be pairs of (int, callable) or simple callables whose actual index will be used as the index of callable. If it's a dict, its key is a python integer and the value is a callable. All callables return the same structure of Tensors.
default(callable, optional): Callable that returns a structure of Tensors.
name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`.
Returns:
Variable|list(Variable): Tensors returned by the callable specified by ``branch_index`` in ``branch_fns``,
Tensor|list(Tensor): Tensors returned by the callable specified by ``branch_index`` in ``branch_fns``,
or Tensors returned by ``default`` if ``default`` is not None and no index matches in ``branch_fns``,
or Tensors returned by the callable with the max index in ``branch_fns`` if ``default`` is None and no index matches in ``branch_fns``.
Raises:
TypeError: If the type of ``branch_index`` is not Variable.
TypeError: If the type of ``branch_index`` is not Tensor.
TypeError: If the data type of ``branch_index`` is not ``int32``, ``int64`` or ``uint8``.
TypeError: If the type of ``branch_fns`` is not dict, list or tuple.
TypeError: If the elements of ``branch_fns`` is not 2-tuple.
......@@ -3633,40 +3632,41 @@ def switch_case(branch_index, branch_fns, default=None, name=None):
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import paddle
paddle.enable_static()
def fn_1():
return layers.fill_constant(shape=[1, 2], dtype='float32', value=1)
return paddle.fill_constant(shape=[1, 2], dtype='float32', value=1)
def fn_2():
return layers.fill_constant(shape=[2, 2], dtype='int32', value=2)
return paddle.fill_constant(shape=[2, 2], dtype='int32', value=2)
def fn_3():
return layers.fill_constant(shape=[3], dtype='int32', value=3)
return paddle.fill_constant(shape=[3], dtype='int32', value=3)
main_program = fluid.default_startup_program()
startup_program = fluid.default_main_program()
with fluid.program_guard(main_program, startup_program):
index_1 = layers.fill_constant(shape=[1], dtype='int32', value=1)
index_2 = layers.fill_constant(shape=[1], dtype='int32', value=2)
main_program = paddle.static.default_main_program()
startup_program = paddle.static.default_startup_program()
with paddle.static.program_guard(main_program, startup_program):
index_1 = paddle.fill_constant(shape=[1], dtype='int32', value=1)
index_2 = paddle.fill_constant(shape=[1], dtype='int32', value=2)
out_1 = layers.switch_case(
out_1 = paddle.static.nn.switch_case(
branch_index=index_1,
branch_fns={1: fn_1, 2: fn_2},
default=fn_3)
out_2 = layers.switch_case(
out_2 = paddle.static.nn.switch_case(
branch_index=index_2,
branch_fns=[(1, fn_1), (2, fn_2)],
default=fn_3)
# Argument default is None and no index matches, so fn_3 is called because it has the max index, 7.
out_3 = layers.switch_case(
out_3 = paddle.static.nn.switch_case(
branch_index=index_2,
branch_fns=[(0, fn_1), (4, fn_2), (7, fn_3)])
exe = fluid.Executor(fluid.CPUPlace())
exe = paddle.static.Executor(paddle.CPUPlace())
res_1, res_2, res_3 = exe.run(main_program, fetch_list=[out_1, out_2, out_3])
print(res_1) # [[1. 1.]]
print(res_2) # [[2 2] [2 2]]
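# res_3 equals [3 3 3]: index_2 (=2) matches no key in branch_fns, default is None, so fn_3 (max index 7) runs.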
......
......@@ -541,7 +541,7 @@ def warpctc(input,
(not including the blank label). When it is a 3-D Tensor, its shape
is `[max_logit_length, batch_size, num_classes + 1]`,
where `max_logit_length` is the longest length of
input logit sequence. The data type must be float32.
input logit sequence. The data type should be float32 or float64.
label (Variable): The ground truth of variable-length sequence,
which must be a 2-D Tensor with LoD information or a 3-D Tensor without
LoD information, needs to be consistent with the corresponding input.
......@@ -571,6 +571,7 @@ def warpctc(input,
.. code-block:: python
# using LoDTensor
import paddle
import paddle.fluid as fluid
import numpy as np
......@@ -581,6 +582,7 @@ def warpctc(input,
# class num
class_num = 5
paddle.enable_static()
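# paddle 2.x starts in dynamic graph mode by default, so switch to static mode before building the graph with fluid.data.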
logits = fluid.data(name='logits',shape=[None, class_num+1],
dtype='float32',lod_level=1)
label = fluid.data(name='label', shape=[None, 1],
......@@ -602,6 +604,7 @@ def warpctc(input,
.. code-block:: python
# using Tensor
import paddle
import paddle.fluid as fluid
import numpy as np
......@@ -613,6 +616,7 @@ def warpctc(input,
batch_size = 16
# class num
class_num = 5
paddle.enable_static()
logits = fluid.data(name='logits',
shape=[max_seq_length, batch_size, class_num+1],
dtype='float32')
......@@ -637,8 +641,23 @@ def warpctc(input,
fetch_list=[cost.name])
print(output)
"""
if in_dygraph_mode():
if input_length is None or label_length is None:
raise ValueError(
"input_length and label_length must not be None in dygraph mode!"
)
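# In dygraph mode the C++ op is called directly: attributes are passed inline as
# alternating name/value pairs, and the op returns (gradient, loss).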
grad, loss_out = core.ops.warpctc(
input,
label,
input_length,
label_length,
'blank',
blank,
'norm_by_times',
norm_by_times, )
return loss_out
helper = LayerHelper('warpctc', **locals())
check_variable_and_dtype(input, 'input', ['float32'], "warpctc")
check_variable_and_dtype(input, 'input', ['float32', 'float64'], "warpctc")
check_variable_and_dtype(label, 'label', ['int32'], "warpctc")
this_inputs = {'Logits': [input], 'Label': [label]}
if input_length is not None and label_length is not None:
......
......@@ -394,7 +394,8 @@ foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP})
endforeach(TEST_OP)
py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4)
py_test_modules(test_warpctc_op MODULES test_warpctc_op)
# disable test_warpctc_op
# py_test_modules(test_warpctc_op MODULES test_warpctc_op)
py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS ${GC_ENVS})
py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS ${GC_ENVS})
py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS
......@@ -531,15 +532,15 @@ if(NOT WIN32)
endif()
if(NOT APPLE AND NOT WIN32)
bash_test_modules(test_auto_checkpoint START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_auto_checkpoint1 START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_auto_checkpoint2 START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_auto_checkpoint3 START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_auto_checkpoint_multiple START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_auto_checkpoint_dist_basic START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_hdfs1 START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_hdfs2 START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_hdfs3 START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_auto_checkpoint START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
bash_test_modules(test_auto_checkpoint1 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
bash_test_modules(test_auto_checkpoint2 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
bash_test_modules(test_auto_checkpoint3 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
bash_test_modules(test_auto_checkpoint_multiple START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
bash_test_modules(test_auto_checkpoint_dist_basic START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
bash_test_modules(test_hdfs1 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
bash_test_modules(test_hdfs2 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
bash_test_modules(test_hdfs3 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
endif()
add_subdirectory(sequence)
......
......@@ -153,7 +153,7 @@ def gen_fake_line(dnn_data_num=7,
return line
def prepare_fake_data(file_nums=9, file_lines=1000):
def prepare_fake_data(file_nums=6, file_lines=1000):
"""
Create fake data with same type as avazu_ctr_data
"""
......
......@@ -206,13 +206,6 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase):
debug=int(os.getenv("Debug", "0")))
pass_time = time.time() - pass_start
print("do_dataset_training done. using time {}".format(pass_time))
if os.getenv("SAVE_MODEL") == "1":
model_dir = tempfile.mkdtemp()
fleet.save_inference_model(exe, model_dir,
[feed.name for feed in self.feeds],
self.avg_cost)
self.check_model_right(model_dir)
shutil.rmtree(model_dir)
fleet.stop_worker()
print("do_dataset_training stop worker.")
......
......@@ -217,6 +217,9 @@ class OpTest(unittest.TestCase):
return False
return True
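# XPU op tests run in fp32 (their init_dtype sets np.float32), so they are exempt
# from the fp64 gradient-precision check below, just like MKL-DNN tests.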
def is_xpu_op_test():
return hasattr(cls, "use_xpu") and cls.use_xpu == True
def is_mkldnn_op_test():
return hasattr(cls, "use_mkldnn") and cls.use_mkldnn == True
......@@ -239,6 +242,7 @@ class OpTest(unittest.TestCase):
if cls.dtype in [np.float32, np.float64] \
and cls.op_type not in op_accuracy_white_list.NO_FP64_CHECK_GRAD_OP_LIST \
and not hasattr(cls, 'exist_fp64_check_grad') \
and not is_xpu_op_test() \
and not is_mkldnn_op_test():
raise AssertionError(
"This test of %s op needs check_grad with fp64 precision." %
......@@ -336,6 +340,11 @@ class OpTest(unittest.TestCase):
self.attrs["use_mkldnn"] == True):
self.__class__.use_mkldnn = True
if (hasattr(self, "use_xpu") and self.use_xpu == True) or \
(hasattr(self, "attrs") and "use_xpu" in self.attrs and \
self.attrs["use_xpu"] == True):
self.__class__.use_xpu = True
op_proto = OpProtoHolder.instance().get_op_proto(self.op_type)
"infer datatype from inputs and outputs for this test case"
self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs)
......@@ -932,6 +941,8 @@ class OpTest(unittest.TestCase):
need_run_ops = self._get_need_run_ops(op_desc)
res = {}
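# Skip the inplace op checks entirely when this case runs with the XPU kernel.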
if hasattr(self, 'attrs') and bool(self.attrs.get('use_xpu', False)):
return
for op_desc, father_op_desc in reversed(need_run_ops):
# The first one is the forward op
has_infer_inplace = fluid.core.has_infer_inplace(op_desc.type())
......@@ -1203,6 +1214,11 @@ class OpTest(unittest.TestCase):
self.attrs["use_mkldnn"] == True):
self.__class__.use_mkldnn = True
if (hasattr(self, "use_xpu") and self.use_xpu == True) or \
(hasattr(self, "attrs") and "use_xpu" in self.attrs and \
self.attrs["use_xpu"] == True):
self.__class__.use_xpu = True
places = self._get_places()
for place in places:
res = self.check_output_with_place(place, atol, no_check_set,
......
......@@ -78,15 +78,17 @@ class TestLeakyReluDoubleGradCheck(unittest.TestCase):
class TestELUDoubleGradCheck(unittest.TestCase):
@prog_scope()
def func(self, place):
shape = [2, 3, 7, 9]
shape = [2, 3, 6, 6]
eps = 1e-6
alpha = 1.1
dtype = np.float64
SEED = 0
x = layers.data('x', shape, False, dtype)
x.persistable = True
y = layers.elu(x, alpha=alpha)
np.random.seed(SEED)  # seed NumPy's global RNG so x_arr is deterministic
x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
gradient_checker.double_grad_check(
[x], y, x_init=x_arr, place=place, eps=eps)
......@@ -171,5 +173,29 @@ class TestAbsDoubleGradCheck(unittest.TestCase):
self.func(p)
class TestLogDoubleGradCheck(unittest.TestCase):
@prog_scope()
def func(self, place):
shape = [2, 3, 7, 9]
eps = 1e-6
dtype = np.float64
x = layers.data('x', shape, False, dtype)
x.persistable = True
y = layers.log(x)
x_arr = np.random.uniform(0.1, 1, shape).astype(dtype)
gradient_checker.double_grad_check(
[x], y, x_init=x_arr, place=place, eps=eps)
def test_grad(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for p in places:
self.func(p)
if __name__ == "__main__":
unittest.main()
......@@ -238,7 +238,7 @@ class TestTanhAPI(unittest.TestCase):
def test_dygraph_api(self):
paddle.disable_static(self.place)
x = paddle.to_variable(self.x_np)
x = paddle.to_tensor(self.x_np)
out1 = F.tanh(x)
out2 = paddle.tanh(x)
th = paddle.nn.Tanh()
......@@ -596,7 +596,7 @@ class TestHardShrinkAPI(unittest.TestCase):
def test_dygraph_api(self):
paddle.disable_static(self.place)
x = paddle.to_variable(self.x_np)
x = paddle.to_tensor(self.x_np)
out1 = F.hardshrink(x)
hd = paddle.nn.Hardshrink()
out2 = hd(x)
......@@ -666,7 +666,7 @@ class TestHardtanhAPI(unittest.TestCase):
def test_dygraph_api(self):
paddle.disable_static(self.place)
x = paddle.to_variable(self.x_np)
x = paddle.to_tensor(self.x_np)
out1 = F.hardtanh(x)
m = paddle.nn.Hardtanh()
out2 = m(x)
......@@ -1112,7 +1112,7 @@ class TestLeakyReluAPI(unittest.TestCase):
def test_dygraph_api(self):
paddle.disable_static(self.place)
x = paddle.to_variable(self.x_np)
x = paddle.to_tensor(self.x_np)
out1 = F.leaky_relu(x)
m = paddle.nn.LeakyReLU()
out2 = m(x)
......
......@@ -25,7 +25,7 @@ class TestAdamaxAPI(unittest.TestCase):
def test_adamax_api_dygraph(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_variable(value)
a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5)
adam = paddle.optimizer.Adamax(
learning_rate=0.01,
......
......@@ -22,7 +22,7 @@ class TestAdamWOp(unittest.TestCase):
def test_adamw_op_dygraph(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_variable(value)
a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5)
adam = paddle.optimizer.AdamW(
learning_rate=0.01,
......@@ -37,7 +37,7 @@ class TestAdamWOp(unittest.TestCase):
def test_adamw_op_coverage(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_variable(value)
a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5)
adam = paddle.optimizer.AdamW(
learning_rate=0.0,
......
......@@ -147,7 +147,7 @@ class TestAdaptiveAvgPool2dAPI(unittest.TestCase):
if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np)
x = paddle.to_tensor(self.x_np)
out_1 = paddle.nn.functional.adaptive_avg_pool2d(
x=x, output_size=[3, 3])
......@@ -245,7 +245,7 @@ class TestAdaptiveAvgPool2dClassAPI(unittest.TestCase):
if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np)
x = paddle.to_tensor(self.x_np)
adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=[3, 3])
out_1 = adaptive_avg_pool(x=x)
......
......@@ -162,7 +162,7 @@ class TestAdaptiveAvgPool3dAPI(unittest.TestCase):
if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np)
x = paddle.to_tensor(self.x_np)
out_1 = paddle.nn.functional.adaptive_avg_pool3d(
x=x, output_size=[3, 3, 3])
......@@ -262,7 +262,7 @@ class TestAdaptiveAvgPool3dClassAPI(unittest.TestCase):
if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np)
x = paddle.to_tensor(self.x_np)
adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
output_size=[3, 3, 3])
......
......@@ -147,7 +147,7 @@ class TestAdaptiveMaxPool2dAPI(unittest.TestCase):
if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np)
x = paddle.to_tensor(self.x_np)
out_1 = paddle.nn.functional.adaptive_max_pool2d(
x=x, return_indices=False, output_size=[3, 3])
......@@ -240,7 +240,7 @@ class TestAdaptiveMaxPool2dClassAPI(unittest.TestCase):
if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np)
x = paddle.to_tensor(self.x_np)
adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=[3, 3])
out_1 = adaptive_max_pool(x=x)
......
......@@ -162,7 +162,7 @@ class TestAdaptiveMaxPool3dAPI(unittest.TestCase):
if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np)
x = paddle.to_tensor(self.x_np)
out_1 = paddle.nn.functional.adaptive_max_pool3d(
x=x, output_size=[3, 3, 3])
......@@ -257,7 +257,7 @@ class TestAdaptiveMaxPool3dClassAPI(unittest.TestCase):
if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np)
x = paddle.to_tensor(self.x_np)
adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
output_size=[3, 3, 3])
......
......@@ -244,9 +244,9 @@ class TestAddMMAPI(unittest.TestCase):
def test_error1():
data_x_wrong = np.ones((2, 3)).astype(np.float32)
x = paddle.to_variable(data_x_wrong)
y = paddle.to_variable(data_y)
input = paddle.to_variable(data_input)
x = paddle.to_tensor(data_x_wrong)
y = paddle.to_tensor(data_y)
input = paddle.to_tensor(data_input)
out = paddle.tensor.addmm( input=input, x=x, y=y, beta=0.5, alpha=5.0 )
self.assertRaises(ValueError, test_error1)
'''
......
......@@ -98,9 +98,9 @@ class TestArangeImperative(unittest.TestCase):
x2 = paddle.tensor.arange(5)
x3 = paddle.tensor.creation.arange(5)
start = paddle.to_variable(np.array([0], 'float32'))
end = paddle.to_variable(np.array([5], 'float32'))
step = paddle.to_variable(np.array([1], 'float32'))
start = paddle.to_tensor(np.array([0], 'float32'))
end = paddle.to_tensor(np.array([5], 'float32'))
step = paddle.to_tensor(np.array([1], 'float32'))
x4 = paddle.arange(start, end, step, 'int64')
paddle.enable_static()
......
......@@ -96,7 +96,7 @@ class TestDygraph(unittest.TestCase):
a = np.random.rand(3, 3)
a_t = np.transpose(a, [1, 0])
x_data = np.matmul(a, a_t) + 1e-03
x = paddle.to_variable(x_data)
x = paddle.to_tensor(x_data)
out = paddle.cholesky(x, upper=False)
......
......@@ -168,9 +168,9 @@ class TestClipAPI(unittest.TestCase):
paddle.disable_static(place)
data_shape = [1, 9, 9, 4]
data = np.random.random(data_shape).astype('float32')
images = paddle.to_variable(data, dtype='float32')
v_min = paddle.to_variable(np.array([0.2], dtype=np.float32))
v_max = paddle.to_variable(np.array([0.8], dtype=np.float32))
images = paddle.to_tensor(data, dtype='float32')
v_min = paddle.to_tensor(np.array([0.2], dtype=np.float32))
v_max = paddle.to_tensor(np.array([0.8], dtype=np.float32))
out_1 = paddle.clip(images, min=0.2, max=0.8)
out_2 = paddle.clip(images, min=0.2, max=0.9)
......
......@@ -113,6 +113,7 @@ class TestCommunicatorGeoEnd2End(unittest.TestCase):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
strategy.a_sync_configs = {"k_steps": 100}
strategy.a_sync_configs = {"launch_barrier": False}
if training_role == "TRAINER":
self.run_trainer(role, strategy)
......
......@@ -51,6 +51,7 @@ class TestCommunicator(unittest.TestCase):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = False
strategy.a_sync_configs = {"launch_barrier": False}
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(avg_cost)
......
......@@ -285,9 +285,9 @@ class TestConcatAPI(unittest.TestCase):
in2 = np.array([[11, 12, 13], [14, 15, 16]])
in3 = np.array([[21, 22], [23, 24]])
paddle.disable_static()
x1 = paddle.to_variable(in1)
x2 = paddle.to_variable(in2)
x3 = paddle.to_variable(in3)
x1 = paddle.to_tensor(in1)
x2 = paddle.to_tensor(in2)
x3 = paddle.to_tensor(in3)
out1 = fluid.layers.concat(input=[x1, x2, x3], axis=-1)
out2 = paddle.concat(x=[x1, x2], axis=0)
np_out1 = np.concatenate([in1, in2, in3], axis=-1)
......
......@@ -75,8 +75,8 @@ class TestCosineSimilarityAPI(unittest.TestCase):
np_x2 = np.random.rand(*shape).astype(np.float32)
np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps)
tesnor_x1 = paddle.to_variable(np_x1)
tesnor_x2 = paddle.to_variable(np_x2)
tesnor_x1 = paddle.to_tensor(np_x1)
tesnor_x2 = paddle.to_tensor(np_x2)
y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps)
self.assertTrue(np.allclose(y.numpy(), np_out))
......@@ -92,8 +92,8 @@ class TestCosineSimilarityAPI(unittest.TestCase):
np_x2 = np.random.rand(*shape).astype(np.float32)
np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps)
tesnor_x1 = paddle.to_variable(np_x1)
tesnor_x2 = paddle.to_variable(np_x2)
tesnor_x1 = paddle.to_tensor(np_x1)
tesnor_x2 = paddle.to_tensor(np_x2)
y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps)
self.assertTrue(np.allclose(y.numpy(), np_out))
......@@ -110,8 +110,8 @@ class TestCosineSimilarityAPI(unittest.TestCase):
np_x2 = np.random.rand(*shape2).astype(np.float32)
np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps)
tesnor_x1 = paddle.to_variable(np_x1)
tesnor_x2 = paddle.to_variable(np_x2)
tesnor_x1 = paddle.to_tensor(np_x1)
tesnor_x2 = paddle.to_tensor(np_x2)
y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps)
self.assertTrue(np.allclose(y.numpy(), np_out))
......@@ -129,8 +129,8 @@ class TestCosineSimilarityAPI(unittest.TestCase):
np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps)
cos_sim_func = nn.CosineSimilarity(axis=axis, eps=eps)
tesnor_x1 = paddle.to_variable(np_x1)
tesnor_x2 = paddle.to_variable(np_x2)
tesnor_x1 = paddle.to_tensor(np_x1)
tesnor_x2 = paddle.to_tensor(np_x2)
y = cos_sim_func(tesnor_x1, tesnor_x2)
self.assertTrue(np.allclose(y.numpy(), np_out))
......
......@@ -21,13 +21,12 @@ import paddle
import paddle.fluid.core as core
import paddle.fluid as fluid
from paddle.fluid import compiler, Program, program_guard
from paddle import to_variable
class TestCumsumOp(unittest.TestCase):
def run_cases(self):
data_np = np.arange(12).reshape(3, 4)
data = to_variable(data_np)
data = paddle.to_tensor(data_np)
y = paddle.cumsum(data)
z = np.cumsum(data_np)
......
......@@ -20,7 +20,6 @@ import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear
import paddle.fluid.core as core
from paddle import to_variable
class TestDefaultType(unittest.TestCase):
......
......@@ -36,7 +36,7 @@ class TestDirectory(unittest.TestCase):
def test_new_directory(self):
new_directory = [
'paddle.enable_static', 'paddle.disable_static',
'paddle.in_dynamic_mode', 'paddle.to_variable', 'paddle.grad',
'paddle.in_dynamic_mode', 'paddle.to_tensor', 'paddle.grad',
'paddle.no_grad', 'paddle.save', 'paddle.load',
'paddle.static.save', 'paddle.static.load',
'paddle.distributed.ParallelEnv',
......
......@@ -52,6 +52,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
strategy.a_sync_configs = {"launch_barrier": False}
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......@@ -92,6 +93,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
strategy.a_sync_configs = {"launch_barrier": False}
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......
......@@ -60,8 +60,8 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
self.assertTrue(optimizer.user_defined_strategy.a_sync)
a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
self.assertTrue(fleet._final_strategy().a_sync)
a_sync_configs = fleet._final_strategy().a_sync_configs
self.assertTrue(a_sync_configs['k_steps'] == 0)
......
......@@ -72,8 +72,8 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
self.assertTrue(optimizer.user_defined_strategy.a_sync)
a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
self.assertTrue(fleet._final_strategy().a_sync)
a_sync_configs = fleet._final_strategy().a_sync_configs
self.assertTrue(a_sync_configs['k_steps'] == 0)
......
......@@ -60,8 +60,8 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
self.assertTrue(optimizer.user_defined_strategy.a_sync)
a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
self.assertTrue(fleet._final_strategy().a_sync)
a_sync_configs = fleet._final_strategy().a_sync_configs
self.assertTrue(a_sync_configs['k_steps'] == 800)
......
......@@ -44,6 +44,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = False
strategy.a_sync_configs = {"launch_barrier": False}
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......
......@@ -312,9 +312,6 @@ class TestFleetBase(unittest.TestCase):
"========================Error tr1_err end==========================="
)
self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check")
self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check")
# close trainer file
tr0_pipe.close()
tr1_pipe.close()
......@@ -325,6 +322,8 @@ class TestFleetBase(unittest.TestCase):
ps1.terminate()
shutil.rmtree(gloo_path)
self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check")
self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check")
return 0, 0
def check_with_place(self,
......
......@@ -81,7 +81,7 @@ class FleetDistHeterRunnerBase(object):
def build_strategy(self, args):
self.strategy = paddle.distributed.fleet.DistributedStrategy()
self.strategy.a_sync = True
self.strategy.a_sync_configs = {"launch_barrier": True}
return self.strategy
def build_optimizer(self, avg_cost, strategy):
......@@ -237,7 +237,10 @@ class TestFleetHeterBase(unittest.TestCase):
return heter0_proc, heter1_proc, heter0_pipe, heter1_pipe
def _run_cluster(self, model, envs):
env = {'GRAD_CLIP': str(self._grad_clip_mode)}
env = {
'GRAD_CLIP': str(self._grad_clip_mode),
'FLAGS_eager_delete_tensor_gb': str(-1)
}
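# A negative FLAGS_eager_delete_tensor_gb disables eager garbage collection of tensors in the spawned trainers.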
python_path = self._python_interp
gloo_path = tempfile.mkdtemp()
......@@ -286,27 +289,6 @@ class TestFleetHeterBase(unittest.TestCase):
tr0_ret = tr0.returncode
tr1_ret = tr0.returncode
print("tr get returncode: {}".format(tr0_ret))
if tr0_ret != 0:
print(
"========================Error tr0_err begin==========================="
)
os.system("cat {}".format(tempfile.gettempdir() + "/tr0_err.log"))
print(
"========================Error tr0_err end==========================="
)
if tr1_ret != 0:
print(
"========================Error tr1_err begin==========================="
)
os.system("cat {}".format(tempfile.gettempdir() + "/tr1_err.log"))
print(
"========================Error tr1_err end==========================="
)
self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check")
self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check")
# close trainer file
tr0_pipe.close()
......@@ -320,7 +302,8 @@ class TestFleetHeterBase(unittest.TestCase):
ps1.terminate()
heter0.terminate()
heter1.terminate()
self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check")
self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check")
shutil.rmtree(gloo_path)
return 0, 0
......
......@@ -23,38 +23,6 @@ import paddle
paddle.enable_static()
class TestDistHeterDatasetAsync2x2(TestFleetHeterBase):
def _setup_config(self):
self._mode = "async"
self._reader = "dataset"
def check_with_place(self,
model_file,
delta=1e-3,
check_error_log=False,
need_envs={}):
required_envs = {
"PATH": os.getenv("PATH", ""),
"PYTHONPATH": os.getenv("PYTHONPATH", ""),
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
"FLAGS_rpc_deadline": "5000", # 5sec to fail fast
"http_proxy": "",
"CPU_NUM": "3"
}
required_envs.update(need_envs)
if check_error_log:
required_envs["GLOG_v"] = "3"
required_envs["GLOG_logtostderr"] = "1"
tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
def test_dist_train(self):
self.check_with_place(
"dist_fleet_heter_ctr.py", delta=1e-5, check_error_log=True)
class TestDistHeterPyreaderAsync2x2(TestFleetHeterBase):
def _setup_config(self):
self._mode = "async"
......
......@@ -195,7 +195,7 @@ class TestFlattenPython(unittest.TestCase):
def test_Negative():
paddle.disable_static()
img = paddle.to_variable(x)
img = paddle.to_tensor(x)
out = paddle.flatten(img, start_axis=-2, stop_axis=-1)
return out.numpy().shape
......
......@@ -18,6 +18,8 @@ import unittest
import paddle
import os
paddle.enable_static()
class TestFleetAMPOptimizer(unittest.TestCase):
def setUp(self):
......@@ -55,6 +57,8 @@ class TestFleetAMPOptimizer(unittest.TestCase):
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
strategy = fleet._final_strategy()
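# After minimize(), the AMP pass should have inserted cast and check_finite_and_unscale ops into the block.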
ops = [op.type for op in avg_cost.block.ops]
self.assertIn('cast', ops)
self.assertIn('check_finite_and_unscale', ops)
......
......@@ -18,6 +18,8 @@ import os
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker
paddle.enable_static()
class TestDistributedStrategyAuto(unittest.TestCase):
def setUp(self):
......
......@@ -167,6 +167,8 @@ class TestFleetDygraph(unittest.TestCase):
state_dict = adam.state_dict()
adam.set_state_dict(state_dict)
final_strategy = fleet._final_strategy()
class TestFleetBaseSingleRunCollective(unittest.TestCase):
def setUp(self):
......
......@@ -19,6 +19,8 @@ import os
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker
paddle.enable_static()
class TestFleetLambMetaOptimizer(unittest.TestCase):
def setUp(self):
......
......@@ -19,6 +19,8 @@ import os
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker
paddle.enable_static()
class TestFleetLarsMetaOptimizer(unittest.TestCase):
def setUp(self):
......
......@@ -211,7 +211,7 @@ class TestImperative(unittest.TestCase):
paddle.disable_static()
self.assertTrue(paddle.in_dynamic_mode())
np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
var_inp = paddle.to_variable(np_inp)
var_inp = paddle.to_tensor(np_inp)
mlp = MLP(input_size=2)
out = mlp(var_inp)
dy_out1 = out.numpy()
......@@ -221,7 +221,7 @@ class TestImperative(unittest.TestCase):
self.assertFalse(paddle.in_dynamic_mode())
paddle.disable_static()
self.assertTrue(paddle.in_dynamic_mode())
var_inp = paddle.to_variable(np_inp)
var_inp = paddle.to_tensor(np_inp)
mlp = MLP(input_size=2)
out = mlp(var_inp)
dy_out2 = out.numpy()
......
......@@ -54,7 +54,7 @@ class TestSimpleNet(unittest.TestCase):
# grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
input_word = np.array([[1, 2], [2, 1]]).astype('int64')
input = paddle.to_variable(input_word)
input = paddle.to_tensor(input_word)
simplenet = SimpleNet(20, 32, dtype)
adam = SGDOptimizer(
......
......@@ -41,7 +41,7 @@ def run_dygraph(x_np, op_str, use_gpu=True):
if use_gpu and fluid.core.is_compiled_with_cuda():
place = paddle.CUDAPlace(0)
paddle.disable_static(place)
x = paddle.to_variable(x_np)
x = paddle.to_tensor(x_np)
dygraph_result = getattr(paddle.tensor, op_str)(x)
return dygraph_result
......
......@@ -543,9 +543,9 @@ class TestJitSaveMultiCases(unittest.TestCase):
loaded_layer = paddle.jit.load(model_path)
loaded_layer.eval()
# inference & compare
x = paddle.to_variable(np.random.random((1, 784)).astype('float32'))
x = paddle.to_tensor(np.random.random((1, 784)).astype('float32'))
if with_label:
y = paddle.to_variable(np.random.random((1, 1)).astype('int64'))
y = paddle.to_tensor(np.random.random((1, 1)).astype('int64'))
pred, _ = layer(x, y)
pred = pred.numpy()
else:
......@@ -677,7 +677,7 @@ class TestJitSaveMultiCases(unittest.TestCase):
model_path = "test_not_prune_output_spec_name_warning"
configs = paddle.SaveLoadConfig()
out = paddle.to_variable(np.random.random((1, 1)).astype('float'))
out = paddle.to_tensor(np.random.random((1, 1)).astype('float'))
configs.output_spec = [out]
paddle.jit.save(layer, model_path, configs=configs)
......@@ -709,7 +709,7 @@ class TestJitSaveMultiCases(unittest.TestCase):
model_path = "test_prune_to_static_after_train"
configs = paddle.SaveLoadConfig()
out = paddle.to_variable(np.random.random((1, 1)).astype('float'))
out = paddle.to_tensor(np.random.random((1, 1)).astype('float'))
configs.output_spec = [out]
with self.assertRaises(ValueError):
paddle.jit.save(
......@@ -730,7 +730,7 @@ class TestJitSaveLoadEmptyLayer(unittest.TestCase):
def test_save_load_empty_layer(self):
layer = EmptyLayer()
x = paddle.to_variable(np.random.random((10)).astype('float32'))
x = paddle.to_tensor(np.random.random((10)).astype('float32'))
out = layer(x)
paddle.jit.save(layer, self.model_path)
load_layer = paddle.jit.load(self.model_path)
......@@ -746,8 +746,8 @@ class TestJitSaveLoadNoParamLayer(unittest.TestCase):
def test_save_load_no_param_layer(self):
layer = NoParamLayer()
x = paddle.to_variable(np.random.random((5)).astype('float32'))
y = paddle.to_variable(np.random.random((5)).astype('float32'))
x = paddle.to_tensor(np.random.random((5)).astype('float32'))
y = paddle.to_tensor(np.random.random((5)).astype('float32'))
out = layer(x, y)
paddle.jit.save(layer, self.model_path)
load_layer = paddle.jit.load(self.model_path)
......
......@@ -90,7 +90,7 @@ class TestKLDivLossDygraph(unittest.TestCase):
with paddle.fluid.dygraph.guard():
kldiv_criterion = paddle.nn.KLDivLoss(reduction)
pred_loss = kldiv_criterion(
paddle.to_variable(x), paddle.to_variable(target))
paddle.to_tensor(x), paddle.to_tensor(target))
self.assertTrue(np.allclose(pred_loss.numpy(), gt_loss))
def test_kl_loss_batchmean(self):
......
......@@ -26,8 +26,8 @@ class TestFunctionalL1Loss(unittest.TestCase):
self.label_np = np.random.random(size=(10, 10, 5)).astype(np.float32)
def run_imperative(self):
input = paddle.to_variable(self.input_np)
label = paddle.to_variable(self.label_np)
input = paddle.to_tensor(self.input_np)
label = paddle.to_tensor(self.label_np)
dy_result = paddle.nn.functional.l1_loss(input, label)
expected = np.mean(np.abs(self.input_np - self.label_np))
self.assertTrue(np.allclose(dy_result.numpy(), expected))
......@@ -106,8 +106,8 @@ class TestClassL1Loss(unittest.TestCase):
self.label_np = np.random.random(size=(10, 10, 5)).astype(np.float32)
def run_imperative(self):
input = paddle.to_variable(self.input_np)
label = paddle.to_variable(self.label_np)
input = paddle.to_tensor(self.input_np)
label = paddle.to_tensor(self.label_np)
l1_loss = paddle.nn.loss.L1Loss()
dy_result = l1_loss(input, label)
expected = np.mean(np.abs(self.input_np - self.label_np))
......
......@@ -96,7 +96,7 @@ class TestNNLogSoftmaxAPI(unittest.TestCase):
# test dygrapg api
paddle.disable_static()
x = paddle.to_variable(self.x)
x = paddle.to_tensor(self.x)
y = logsoftmax(x)
self.assertTrue(np.allclose(y.numpy(), ref_out))
paddle.enable_static()
......@@ -127,7 +127,7 @@ class TestNNFunctionalLogSoftmaxAPI(unittest.TestCase):
self.assertTrue(np.allclose(out[0], ref_out))
paddle.disable_static()
x = paddle.to_variable(self.x)
x = paddle.to_tensor(self.x)
y = F.log_softmax(x, axis, dtype)
self.assertTrue(np.allclose(y.numpy(), ref_out), True)
paddle.enable_static()
......
......@@ -111,7 +111,7 @@ class TestLogsumexpAPI(unittest.TestCase):
self.assertTrue(np.allclose(res[0], out_ref))
paddle.disable_static(self.place)
x = paddle.to_variable(self.x)
x = paddle.to_tensor(self.x)
out = paddle.logsumexp(x, axis, keepdim)
self.assertTrue(np.allclose(out.numpy(), out_ref))
paddle.enable_static()
......@@ -126,7 +126,7 @@ class TestLogsumexpAPI(unittest.TestCase):
def test_alias(self):
paddle.disable_static(self.place)
x = paddle.to_variable(self.x)
x = paddle.to_tensor(self.x)
out1 = paddle.logsumexp(x)
out2 = paddle.tensor.logsumexp(x)
out3 = paddle.tensor.math.logsumexp(x)
......
......@@ -341,10 +341,12 @@ class TestMathOpPatchesVarBase(unittest.TestCase):
np.array_equal(x.rank().numpy(), paddle.rank(x).numpy()))
self.assertTrue(
np.array_equal(x[0].t().numpy(), paddle.t(x[0]).numpy()))
m = paddle.to_tensor(np.random.uniform(1, 2, [3, 3]), 'float32')
m = m.matmul(m.t())
d = paddle.to_tensor([[1.2285208, 1.3491015, 1.4899898],
[1.30058, 1.0688717, 1.4928783],
[1.0958099, 1.3724753, 1.8926544]])
d = d.matmul(d.t())
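# d.matmul(d.t()) gives a symmetric positive-definite matrix, so cholesky() is well defined and the fixed values keep the check deterministic.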
self.assertTrue(
np.array_equal(m.cholesky().numpy(), paddle.cholesky(m).numpy()))
np.array_equal(d.cholesky().numpy(), paddle.cholesky(d).numpy()))
self.assertTrue(
np.array_equal(x.is_empty().numpy(), paddle.is_empty(x).numpy()))
......
......@@ -14,6 +14,7 @@
from __future__ import print_function
import paddle.fluid.core as core
import unittest
import numpy as np
from op_test import OpTest
......
......@@ -80,7 +80,7 @@ class ApiMaxTest(unittest.TestCase):
def test_imperative_api(self):
paddle.disable_static()
np_x = np.array([10, 10]).astype('float64')
x = paddle.to_variable(np_x)
x = paddle.to_tensor(np_x)
z = paddle.max(x, axis=0)
np_z = z.numpy()
z_expected = np.array(np.max(np_x, axis=0))
......
......@@ -61,8 +61,8 @@ class ApiMaximumTest(unittest.TestCase):
def test_dynamic_api(self):
paddle.disable_static()
np_x = np.array([10, 10]).astype('float64')
x = paddle.to_variable(self.input_x)
y = paddle.to_variable(self.input_y)
x = paddle.to_tensor(self.input_x)
y = paddle.to_tensor(self.input_y)
z = paddle.maximum(x, y)
np_z = z.numpy()
z_expected = np.array(np.maximum(self.input_x, self.input_y))
......@@ -73,8 +73,8 @@ class ApiMaximumTest(unittest.TestCase):
np_x = np.random.rand(5, 4, 3, 2).astype("float64")
np_y = np.random.rand(4, 3).astype("float64")
x = paddle.to_variable(self.input_x)
y = paddle.to_variable(self.input_y)
x = paddle.to_tensor(self.input_x)
y = paddle.to_tensor(self.input_y)
result_1 = paddle.maximum(x, y, axis=1)
result_2 = paddle.maximum(x, y, axis=-2)
self.assertEqual((result_1.numpy() == result_2.numpy()).all(), True)
......@@ -204,7 +204,7 @@ class TestMeanAPI(unittest.TestCase):
paddle.disable_static(self.place)
def test_case(x, axis=None, keepdim=False):
x_tensor = paddle.to_variable(x)
x_tensor = paddle.to_tensor(x)
out = paddle.mean(x_tensor, axis, keepdim)
if isinstance(axis, list):
axis = tuple(axis)
......
......@@ -80,7 +80,7 @@ class ApiMinTest(unittest.TestCase):
def test_imperative_api(self):
paddle.disable_static()
np_x = np.array([10, 10]).astype('float64')
x = paddle.to_variable(np_x)
x = paddle.to_tensor(np_x)
z = paddle.min(x, axis=0)
np_z = z.numpy()
z_expected = np.array(np.min(np_x, axis=0))
......
......@@ -18,6 +18,8 @@ import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
import sys
sys.path.append("..")
from op_test import OpTest
import paddle.fluid as fluid
from paddle.fluid import Program, program_guard
......@@ -175,57 +177,5 @@ class TestFP16MulOp2(TestMulOp2):
no_grad_set=set('Y'))
@unittest.skipIf(not core.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUMulOp1(TestMulOp):
def init_dtype_type(self):
self.dtype = np.float32
def test_check_output(self):
place = core.XPUPlace(0)
self.check_output_with_place(place, atol=1e-1)
def test_check_grad_normal(self):
place = core.XPUPlace(0)
self.check_grad_with_place(
place, ['X', 'Y'], 'Out', max_relative_error=0.5)
def test_check_grad_ingore_x(self):
place = core.XPUPlace(0)
self.check_grad_with_place(
place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
def test_check_grad_ingore_y(self):
place = core.XPUPlace(0)
self.check_grad_with_place(
place, ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
@unittest.skipIf(not core.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUMulOp2(TestMulOp2):
def init_dtype_type(self):
self.dtype = np.float32
def test_check_output(self):
place = core.XPUPlace(0)
self.check_output_with_place(place, atol=2e-1)
def test_check_grad_normal(self):
place = core.XPUPlace(0)
self.check_grad_with_place(
place, ['X', 'Y'], 'Out', max_relative_error=0.9)
def test_check_grad_ingore_x(self):
place = core.XPUPlace(0)
self.check_grad_with_place(
place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
def test_check_grad_ingore_y(self):
place = core.XPUPlace(0)
self.check_grad_with_place(
place, ['X'], 'Out', max_relative_error=0.9, no_grad_set=set('Y'))
if __name__ == "__main__":
unittest.main()
......@@ -20,6 +20,7 @@ import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import paddle.fluid.core as core
from paddle.static import program_guard, Program
from op_test import OpTest
......@@ -37,7 +38,7 @@ class TestMVOp(OpTest):
self.check_grad(['X', 'Vec'], 'Out')
def init_config(self):
self.x = np.random.random((5, 100)).astype("float64")
self.x = np.random.random((2, 100)).astype("float64")
self.vec = np.random.random((100)).astype("float64")
......@@ -57,21 +58,36 @@ class TestMVAPI(unittest.TestCase):
paddle.enable_static()
def test_static_graph(self):
paddle.enable_static()
for x_stop_gradient in [False, True]:
for vec_stop_gradient in [False, True]:
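# Exercise every combination of stop_gradient on the matrix and the vector input.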
paddle.enable_static()
train_program = Program()
startup_program = Program()
self.input_x = np.random.rand(5, 100).astype("float64")
self.input_vec = np.random.rand(100).astype("float64")
with program_guard(train_program, startup_program):
data_x = paddle.static.data(
"x", shape=[5, 100], dtype="float64")
data_vec = paddle.static.data(
"vec", shape=[100], dtype="float64")
data_x.stop_gradient = x_stop_gradient
data_vec.stop_gradient = vec_stop_gradient
result_vec = paddle.mv(data_x, data_vec)
self.input_x = np.random.rand(5, 100).astype("float64")
self.input_vec = np.random.rand(100).astype("float64")
data_x = paddle.static.data("x", shape=[5, 100], dtype="float64")
data_vec = paddle.static.data("vec", shape=[100], dtype="float64")
result_vec = paddle.mv(data_x, data_vec)
self.place = paddle.CPUPlace()
exe = paddle.static.Executor(self.place)
res, = exe.run(feed={"x": self.input_x,
"vec": self.input_vec},
fetch_list=[result_vec])
z_expected = np.array(np.dot(self.input_x, self.input_vec))
self.assertTrue(np.allclose(res, z_expected))
self.place = paddle.CPUPlace()
exe = paddle.static.Executor(self.place)
res, = exe.run(
feed={"x": self.input_x,
"vec": self.input_vec},
fetch_list=[result_vec])
z_expected = np.array(np.dot(self.input_x, self.input_vec))
self.assertTrue(np.allclose(res, z_expected))
class TestMVError(unittest.TestCase):
......
......@@ -195,6 +195,23 @@ class TestPool1d_API(unittest.TestCase):
result = max_pool1d_dg(input)
self.assertTrue(np.allclose(result.numpy(), result_np))
def check_max_dygraph_return_index_results(self, place):
with fluid.dygraph.guard(place):
input_np = np.random.random([2, 3, 32]).astype("float32")
input = fluid.dygraph.to_variable(input_np)
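# With return_indices=True, max_pool1d also returns the argmax indices of each pooling window.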
result, index = F.max_pool1d(
input, kernel_size=2, stride=2, padding=0, return_indices=True)
result_np = max_pool1D_forward_naive(
input_np, ksize=[2], strides=[2], paddings=[0])
self.assertTrue(np.allclose(result.numpy(), result_np))
max_pool1d_dg = paddle.nn.layer.MaxPool1d(
kernel_size=2, stride=None, padding=0)
result = max_pool1d_dg(input)
self.assertTrue(np.allclose(result.numpy(), result_np))
def check_max_dygraph_padding_same(self, place):
with fluid.dygraph.guard(place):
input_np = np.random.random([2, 3, 32]).astype("float32")
......@@ -228,6 +245,7 @@ class TestPool1d_API(unittest.TestCase):
self.check_avg_static_results(place)
self.check_max_dygraph_padding_same(place)
self.check_avg_dygraph_padding_same(place)
self.check_max_dygraph_return_index_results(place)
class TestPool2dError_API(unittest.TestCase):
......
......@@ -63,7 +63,7 @@ class TestRandnOpForDygraph(unittest.TestCase):
dim_2 = paddle.fill_constant([1], "int32", 50)
x3 = paddle.randn(shape=[dim_1, dim_2, 784])
var_shape = paddle.to_variable(np.array(shape))
var_shape = paddle.to_tensor(np.array(shape))
x4 = paddle.randn(var_shape)
for out in [x1, x2, x3, x4]:
......
......@@ -105,8 +105,8 @@ class TestRetainGraph(unittest.TestCase):
A = np.random.rand(2, 3, 32, 32).astype('float32')
B = np.random.rand(2, 3, 32, 32).astype('float32')
realA = paddle.to_variable(A)
realB = paddle.to_variable(B)
realA = paddle.to_tensor(A)
realB = paddle.to_tensor(B)
fakeB = g(realA)
optim_d.clear_gradients()
......
......@@ -487,24 +487,24 @@ class TestTransformer(unittest.TestCase):
dropout=dropout,
weight_attr=[None],
bias_attr=[False])
src = paddle.to_variable(
src = paddle.to_tensor(
np.random.rand(batch_size, source_length, d_model).astype(
"float32"))
tgt = paddle.to_variable(
tgt = paddle.to_tensor(
np.random.rand(batch_size, target_length, d_model).astype(
"float32"))
src_mask = np.zeros((batch_size, n_head, source_length,
source_length)).astype("float32")
src_mask[0][0][0][0] = -np.inf
src_mask = paddle.to_variable(src_mask)
src_mask = paddle.to_tensor(src_mask)
tgt_mask = np.zeros((batch_size, n_head, target_length,
target_length)).astype("float32")
tgt_mask[0][0][0][0] = -1e9
memory_mask = np.zeros((batch_size, n_head, target_length,
source_length)).astype("float32")
memory_mask[0][0][0][0] = -1e9
tgt_mask, memory_mask = paddle.to_variable(
tgt_mask), paddle.to_variable(memory_mask)
tgt_mask, memory_mask = paddle.to_tensor(
tgt_mask), paddle.to_tensor(memory_mask)
trans_output = transformer(src, tgt, src_mask, tgt_mask,
memory_mask)
......@@ -521,24 +521,24 @@ class TestTransformer(unittest.TestCase):
dropout=dropout,
weight_attr=[None, None],
bias_attr=[False, False])
src = paddle.to_variable(
src = paddle.to_tensor(
np.random.rand(batch_size, source_length, d_model).astype(
"float32"))
tgt = paddle.to_variable(
tgt = paddle.to_tensor(
np.random.rand(batch_size, target_length, d_model).astype(
"float32"))
src_mask = np.zeros((batch_size, n_head, source_length,
source_length)).astype("float32")
src_mask[0][0][0][0] = -np.inf
src_mask = paddle.to_variable(src_mask)
src_mask = paddle.to_tensor(src_mask)
tgt_mask = np.zeros((batch_size, n_head, target_length,
target_length)).astype("float32")
tgt_mask[0][0][0][0] = -1e9
memory_mask = np.zeros((batch_size, n_head, target_length,
source_length)).astype("float32")
memory_mask[0][0][0][0] = -1e9
tgt_mask, memory_mask = paddle.to_variable(
tgt_mask), paddle.to_variable(memory_mask)
tgt_mask, memory_mask = paddle.to_tensor(
tgt_mask), paddle.to_tensor(memory_mask)
trans_output = transformer(src, tgt, src_mask, tgt_mask,
memory_mask)
......@@ -555,24 +555,24 @@ class TestTransformer(unittest.TestCase):
dropout=dropout,
weight_attr=[None, None, None],
bias_attr=[False, False, True])
src = paddle.to_variable(
src = paddle.to_tensor(
np.random.rand(batch_size, source_length, d_model).astype(
"float32"))
tgt = paddle.to_variable(
tgt = paddle.to_tensor(
np.random.rand(batch_size, target_length, d_model).astype(
"float32"))
src_mask = np.zeros((batch_size, n_head, source_length,
source_length)).astype("float32")
src_mask[0][0][0][0] = -np.inf
src_mask = paddle.to_variable(src_mask)
src_mask = paddle.to_tensor(src_mask)
tgt_mask = np.zeros((batch_size, n_head, target_length,
target_length)).astype("float32")
tgt_mask[0][0][0][0] = -1e9
memory_mask = np.zeros((batch_size, n_head, target_length,
source_length)).astype("float32")
memory_mask[0][0][0][0] = -1e9
tgt_mask, memory_mask = paddle.to_variable(
tgt_mask), paddle.to_variable(memory_mask)
tgt_mask, memory_mask = paddle.to_tensor(
tgt_mask), paddle.to_tensor(memory_mask)
trans_output = transformer(src, tgt, src_mask, tgt_mask,
memory_mask)
......@@ -588,24 +588,24 @@ class TestTransformer(unittest.TestCase):
dim_feedforward=dim_feedforward,
dropout=dropout,
bias_attr=False)
src = paddle.to_variable(
src = paddle.to_tensor(
np.random.rand(batch_size, source_length, d_model).astype(
"float32"))
tgt = paddle.to_variable(
tgt = paddle.to_tensor(
np.random.rand(batch_size, target_length, d_model).astype(
"float32"))
src_mask = np.zeros((batch_size, n_head, source_length,
source_length)).astype("float32")
src_mask[0][0][0][0] = -np.inf
src_mask = paddle.to_variable(src_mask)
src_mask = paddle.to_tensor(src_mask)
tgt_mask = np.zeros((batch_size, n_head, target_length,
target_length)).astype("float32")
tgt_mask[0][0][0][0] = -1e9
memory_mask = np.zeros((batch_size, n_head, target_length,
source_length)).astype("float32")
memory_mask[0][0][0][0] = -1e9
tgt_mask, memory_mask = paddle.to_variable(
tgt_mask), paddle.to_variable(memory_mask)
tgt_mask, memory_mask = paddle.to_tensor(
tgt_mask), paddle.to_tensor(memory_mask)
trans_output = transformer(src, tgt, src_mask, tgt_mask,
memory_mask)
......
......@@ -24,7 +24,7 @@ from paddle.fluid import Program, program_guard
import paddle
import paddle.nn.functional as F
CUDA_BLOCK_SIZE = 512
CUDA_BLOCK_SIZE = 32
class CTCForward(object):
......@@ -41,8 +41,8 @@ class CTCForward(object):
self.num_classes = num_classes
self.batch_size = batch_size
self.loss = np.zeros([self.batch_size, 1], dtype="float32")
self.gradient = np.zeros(self.softmax.shape, dtype="float32")
self.loss = np.zeros([self.batch_size, 1], dtype=softmax.dtype)
self.gradient = np.zeros(self.softmax.shape, dtype=softmax.dtype)
# float64
self.EXP_MAX = sys.float_info.max
......@@ -112,13 +112,15 @@ class CTCForward(object):
# calculate the forward and backward variables,
# reference Chapter 7.3 of "Alex Grave, Supervised Sequence
# Labelling with Recurrent Neural Networks"
log_acts = np.zeros([total_times, self.num_classes], dtype="float32")
log_acts = np.zeros(
[total_times, self.num_classes], dtype=softmax_a_sequence.dtype)
for i in range(total_times):
for j in range(self.num_classes):
log_acts[i, j] = self.safe_log(softmax_a_sequence[i, j])
# calculate the forward variables
forward_vars = np.zeros([total_times, total_segments], dtype="float32")
forward_vars = np.zeros(
[total_times, total_segments], dtype=softmax_a_sequence.dtype)
for i in range(total_times):
for j in range(total_segments):
forward_vars[i, j] = self.LOG_ZERO
......@@ -219,7 +221,7 @@ class TestWarpCTCOp(OpTest):
self.logits_lod[0][i])
self.gradient = np.zeros(
[max_sequence_length, self.batch_size, self.num_classes],
dtype="float32")
dtype=logits.dtype)
self.inputs = {
"Logits": (logits, self.logits_lod),
......@@ -287,7 +289,7 @@ class TestWarpCTCOpWithPadding(OpTest):
# reshape logits to T*N*S
new_logits = np.zeros(
[max_sequence_length, self.batch_size, self.num_classes],
dtype="float32")
dtype=logits.dtype)
cur = 0
for batch_id in range(self.batch_size):
......@@ -312,7 +314,7 @@ class TestWarpCTCOpWithPadding(OpTest):
self.gradient = np.zeros(
[max_sequence_length, self.batch_size, self.num_classes],
dtype="float32")
dtype=logits.dtype)
self.inputs = {
"Logits": new_logits,
......@@ -347,6 +349,90 @@ class TestWarpCTCOpWithPaddingCase1(TestWarpCTCOpWithPadding):
self.norm_by_times = False
class TestWarpCTCOpFp64(OpTest):
def config(self):
self.batch_size = 4
self.num_classes = 8
self.logits_lod = [[4, 1, 5, 5]]
self.labels_lod = [[3, 1, 4, 2]]
self.logits_length = np.array([4, 1, 5, 5], dtype=np.int64)
self.labels_length = np.array([3, 1, 4, 2], dtype=np.int64)
self.blank = self.num_classes - 1
self.norm_by_times = False
def setUp(self):
self.op_type = "warpctc"
self.config()
logits = np.random.uniform(
0.1, 1.0,
[sum(self.logits_length), self.num_classes]).astype("float64")
softmax = np.apply_along_axis(stable_softmax, 1, logits)
# labels should not be blank
labels = np.random.randint(
0,
self.num_classes - 1, [sum(self.labels_length), 1],
dtype="int32")
ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod,
self.num_classes, self.batch_size, self.blank,
self.norm_by_times)
loss = ctc.forward()
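# The reference loss and gradient follow softmax.dtype (float64 here), so this case exercises the fp64 kernel end to end.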
max_sequence_length = 0
for i in range(self.batch_size):
max_sequence_length = max(max_sequence_length,
self.logits_length[i])
# reshape logits to T*N*S
new_logits = np.zeros(
[max_sequence_length, self.batch_size, self.num_classes],
dtype=logits.dtype)
cur = 0
for batch_id in range(self.batch_size):
for i in range(self.logits_length[batch_id]):
for j in range(self.num_classes):
new_logits[i, batch_id, j] = logits[cur + i, j]
cur = cur + self.logits_length[batch_id]
# reshape labels to N*S
max_target_seq_length = 0
for i in range(self.batch_size):
max_target_seq_length = max(max_target_seq_length,
self.labels_length[i])
new_labels = np.zeros(
[self.batch_size, max_target_seq_length], dtype="int32")
cur = 0
for batch_id in range(self.batch_size):
for i in range(self.labels_length[batch_id]):
new_labels[batch_id, i] = labels[cur + i]
cur = cur + self.labels_length[batch_id]
self.gradient = np.zeros(
[max_sequence_length, self.batch_size, self.num_classes],
dtype=logits.dtype)
self.inputs = {
"Logits": new_logits,
"Label": new_labels,
"LogitsLength": self.logits_length,
"LabelLength": self.labels_length
}
self.outputs = {"Loss": loss}
self.attrs = {
"blank": self.blank,
"norm_by_times": self.norm_by_times,
}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.outputs['WarpCTCGrad'] = self.gradient
self.check_grad(["Logits"], "Loss")
class TestWarpCTCOpError(unittest.TestCase):
def test_errors(self):
with program_guard(Program(), Program()):
......@@ -359,7 +445,7 @@ class TestWarpCTCOpError(unittest.TestCase):
name='labels_length', shape=[None], dtype='int64')
def test_logits_Variable():
logits_data = np.random.rand(5, 16, 6).astype("float32")
logits_data = np.random.rand(5, 16, 6).astype(logits.dtype)
fluid.layers.warpctc(
input=logits_data,
label=label,
......@@ -398,6 +484,21 @@ class TestWarpCTCOpError(unittest.TestCase):
self.assertRaises(TypeError, test_label_len_Variable)
def test_dygraph_errors(self):
def test_dygraph_with_lod():
logits = np.random.uniform(0.1, 1.0, [20, 15]).astype("float32")
# labels should not be blank
labels = np.random.randint(0, 15 - 1, [15, 1], dtype="int32")
softmax = paddle.to_variable(logits)
labels = paddle.to_variable(labels)
fluid.layers.warpctc(input=softmax, label=labels)
paddle.disable_static()
self.assertRaises(ValueError, test_dygraph_with_lod)
paddle.enable_static()
class TestCTCLossAPICase(unittest.TestCase):
def test_functinal_api(self):
......
......@@ -63,7 +63,7 @@ class TestZerosLikeImpeartive(unittest.TestCase):
place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
) else fluid.CPUPlace()
paddle.disable_static(place)
x = paddle.to_variable(np.ones(shape))
x = paddle.to_tensor(np.ones(shape))
for dtype in [np.bool, np.float32, np.float64, np.int32, np.int64]:
out = zeros_like(x, dtype)
self.assertEqual((out.numpy() == np.zeros(shape, dtype)).all(),
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import sys
sys.path.append("..")
import unittest
import numpy as np
import paddle.fluid.core as core
from op_test import OpTest
from scipy.special import expit, erf
import paddle
import paddle.fluid as fluid
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.fluid import compiler, Program, program_guard
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUActivation(OpTest):
def setUp(self):
self.op_type = "exp"
self.init_dtype()
self.init_kernel_type()
x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
out = np.exp(x)
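# 'use_xpu': True marks the case so OpTest records it as an XPU test (see the op_test.py changes above).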
self.attrs = {'use_xpu': True}
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def init_dtype(self):
self.dtype = np.float32
def test_check_output(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_output_with_place(place, atol=1e-3)
def init_kernel_type(self):
pass
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUSigmoid(TestXPUActivation):
def setUp(self):
self.op_type = "sigmoid"
self.init_dtype()
x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
out = 1 / (1 + np.exp(-x))
self.attrs = {'use_xpu': True}
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_grad(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['X'], 'Out', max_relative_error=0.01)
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUTanh(TestXPUActivation):
def setUp(self):
self.op_type = "tanh"
self.init_dtype()
x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
out = np.tanh(x)
self.attrs = {'use_xpu': True}
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUSqrt(TestXPUActivation):
def setUp(self):
self.op_type = "sqrt"
self.init_dtype()
x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
out = np.sqrt(x)
self.attrs = {'use_xpu': True}
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUAbs(TestXPUActivation):
def setUp(self):
self.op_type = "abs"
self.init_dtype()
x = np.random.uniform(-1, 1, [4, 25]).astype(self.dtype)
# Because we set delta = 0.005 in calculating numeric gradient,
# if x is too small, such as 0.002, x_neg will be -0.003
# x_pos will be 0.007, so the numeric gradient is inaccurate.
# we should avoid this
x[np.abs(x) < 0.005] = 0.02
out = np.abs(x)
self.attrs = {'use_xpu': True}
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPURelu(TestXPUActivation):
def setUp(self):
self.op_type = "relu"
self.init_dtype()
x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
# The same reason with TestAbs
x[np.abs(x) < 0.005] = 0.02
out = np.maximum(x, 0)
self.attrs = {'use_xpu': True}
self.inputs = {'X': x}
self.outputs = {'Out': out}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUGelu(TestXPUActivation):
def setUp(self):
self.op_type = "gelu"
self.init_dtype()
approximate = False
x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
out = gelu(x, approximate)
self.inputs = {'X': x}
self.outputs = {'Out': out}
self.attrs = {"approximate": approximate, 'use_xpu': True}
def gelu(x, approximate):
if approximate:
y_ref = 0.5 * x * (1.0 + np.tanh(
np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))))
else:
y_ref = 0.5 * x * (1 + erf(x / np.sqrt(2)))
return y_ref.astype(x.dtype)
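# Quick sanity check of the reference above (illustrative values, not part of the
# original file): gelu(0, False) == 0, and gelu(1, False) is about 0.8413, since
# 0.5 * (1 + erf(1 / sqrt(2))) = 0.5 * (1 + 0.6827) = 0.8413.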
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPULog(TestXPUActivation):
def setUp(self):
self.op_type = "log"
self.init_dtype()
x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
out = np.log(x)
self.attrs = {'use_xpu': True}
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUSquare(TestXPUActivation):
def setUp(self):
self.op_type = "square"
self.init_dtype()
x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
out = np.square(x)
self.attrs = {'use_xpu': True}
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUPow(TestXPUActivation):
def setUp(self):
self.op_type = "pow"
self.init_dtype()
x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype)
out = np.power(x, 3)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.attrs = {'factor': 3.0, 'use_xpu': True}
self.outputs = {'Out': out}
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import sys
sys.path.append("..")
import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
from op_test import OpTest, skip_check_grad_ci
import paddle.fluid as fluid
from paddle.fluid import compiler, Program, program_guard
class TestElementwiseAddOp(OpTest):
def init_kernel_type(self):
self.use_mkldnn = False
def setUp(self):
self.op_type = "elementwise_add"
self.init_dtype()
self.init_input_output()
self.init_kernel_type()
self.init_axis()
self.inputs = {
'X': OpTest.np_dtype_to_fluid_dtype(self.x),
'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
}
self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
self.outputs = {'Out': self.out}
def test_check_output(self):
# TODO(wangzhongpu): support mkldnn op in dygraph mode
self.check_output(check_dygraph=(self.use_mkldnn == False))
def test_check_grad_normal(self):
# TODO(wangzhongpu): support mkldnn op in dygraph mode
if self.dtype == np.float16:
return
self.check_grad(
['X', 'Y'], 'Out', check_dygraph=(self.use_mkldnn == False))
def test_check_grad_ingore_x(self):
# TODO(wangzhongpu): support mkldnn op in dygraph mode
if self.dtype == np.float16:
return
self.check_grad(
['Y'],
'Out',
no_grad_set=set("X"),
check_dygraph=(self.use_mkldnn == False))
def test_check_grad_ingore_y(self):
# TODO(wangzhongpu): support mkldnn op in dygraph mode
if self.dtype == np.float16:
return
self.check_grad(
['X'],
'Out',
no_grad_set=set('Y'),
check_dygraph=(self.use_mkldnn == False))
def init_input_output(self):
self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
self.out = np.add(self.x, self.y)
def init_dtype(self):
self.dtype = np.float64
def init_axis(self):
self.axis = -1
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUElementwiseAddOp(OpTest):
def setUp(self):
self.op_type = "elementwise_add"
self.init_dtype()
self.init_input_output()
self.init_axis()
self.inputs = {'X': self.x, 'Y': self.y}
self.attrs = {'axis': self.axis, 'use_mkldnn': False, 'use_xpu': True}
self.outputs = {'Out': self.out}
def test_check_output(self):
if self.dtype == np.float32 and paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_output_with_place(place)
def test_check_grad_normal(self):
if self.dtype == np.float32 and paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(place, ['X', 'Y'], 'Out')
def test_check_grad_ingore_x(self):
if self.dtype == np.float32 and paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(place, ['Y'], 'Out')
def test_check_grad_ingore_y(self):
if self.dtype == np.float32 and paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(place, ['X'], 'Out')
def init_input_output(self):
self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
self.out = np.add(self.x, self.y)
def init_dtype(self):
self.dtype = np.float32
def init_axis(self):
self.axis = -1
@skip_check_grad_ci(
reason="[skip shape check] Use y_shape(1) to test broadcast.")
class TestElementwiseAddOp_scalar(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 3, 4).astype(self.dtype)
self.y = np.random.rand(1).astype(self.dtype)
self.out = self.x + self.y
@skip_check_grad_ci(
reason="[skip shape check] Use y_shape(1,1) to test broadcast.")
class TestElementwiseAddOp_scalar2(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 3, 4).astype(self.dtype)
self.y = np.random.rand(1, 1).astype(self.dtype)
self.out = self.x + self.y
class TestElementwiseAddOp_Vector(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.random((100, )).astype(self.dtype)
self.y = np.random.random((100, )).astype(self.dtype)
self.out = np.add(self.x, self.y)
class TestElementwiseAddOp_broadcast_0(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(100, 2, 3).astype(self.dtype)
self.y = np.random.rand(100).astype(self.dtype)
self.out = self.x + self.y.reshape(100, 1, 1)
def init_axis(self):
self.axis = 0
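# Note (added for clarity): with a 1-D y and axis=0, elementwise_add aligns y's
# single dimension with dimension 0 of x, which is why the expected output above
# broadcasts y as y.reshape(100, 1, 1).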
class TestElementwiseAddOp_broadcast_1(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 100, 3).astype(self.dtype)
self.y = np.random.rand(100).astype(self.dtype)
self.out = self.x + self.y.reshape(1, 100, 1)
def init_axis(self):
self.axis = 1
class TestElementwiseAddOp_broadcast_2(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 3, 100).astype(self.dtype)
self.y = np.random.rand(100).astype(self.dtype)
self.out = self.x + self.y.reshape(1, 1, 100)
class TestElementwiseAddOp_broadcast_3(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype)
self.y = np.random.rand(10, 12).astype(self.dtype)
self.out = self.x + self.y.reshape(1, 10, 12, 1)
def init_axis(self):
self.axis = 1
class TestElementwiseAddOp_broadcast_4(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(100, 2, 3, 4).astype(self.dtype)
self.y = np.random.rand(100, 1).astype(self.dtype)
self.out = self.x + self.y.reshape(100, 1, 1, 1)
def init_axis(self):
self.axis = 0
class TestElementwiseAddOp_broadcast_5(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(10, 3, 12).astype(self.dtype)
self.y = np.random.rand(10, 1, 12).astype(self.dtype)
self.out = self.x + self.y
class TestElementwiseAddOp_broadcast_6(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 12, 3, 5).astype(self.dtype)
self.y = np.random.rand(2, 12, 1, 5).astype(self.dtype)
self.out = self.x + self.y
class TestElementwiseAddOp_broadcast_7(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(1, 1, 20, 5).astype(self.dtype)
self.y = np.random.rand(20, 5, 1, 1).astype(self.dtype)
self.out = self.x + self.y
class TestElementwiseAddOp_rowwise_add_0(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 10, 12).astype(self.dtype)
self.y = np.random.rand(10, 12).astype(self.dtype)
self.out = self.x + self.y.reshape(1, 10, 12)
def init_axis(self):
self.axis = 1
@skip_check_grad_ci(
reason="[skip shape check] Use y_shape(1) to test broadcast.")
class TestElementwiseAddOp_rowwise_add_1(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(100, 1).astype(self.dtype)
self.y = np.random.rand(1).astype(self.dtype)
self.out = self.x + self.y.reshape(1, 1)
def init_axis(self):
self.axis = 1
class TestElementwiseAddOp_channelwise_add(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(100, 2, 3).astype(self.dtype)
self.y = np.random.rand(100, 1, 1).astype(self.dtype)
self.out = self.x + self.y
def init_axis(self):
self.axis = -1
class TestElementwiseAddOp_commonuse_add1(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 3, 100).astype(self.dtype)
self.y = np.random.rand(1, 1, 100).astype(self.dtype)
self.out = self.x + self.y
def init_axis(self):
self.axis = -1
class TestElementwiseAddOp_commonuse_add2(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(10, 3, 1, 4).astype(self.dtype)
self.y = np.random.rand(10, 1, 12, 1).astype(self.dtype)
self.out = self.x + self.y
def init_axis(self):
self.axis = -1
class TestElementwiseAddOp_xsize_lessthan_ysize_add(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(10, 12).astype(self.dtype)
self.y = np.random.rand(2, 3, 10, 12).astype(self.dtype)
self.out = self.x + self.y
def init_axis(self):
self.axis = 2
class TestElementwiseAddOpError(unittest.TestCase):
def test_errors(self):
with program_guard(Program(), Program()):
# the input of elementwise_add must be Variable.
x1 = fluid.create_lod_tensor(
np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
y1 = fluid.create_lod_tensor(
np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
self.assertRaises(TypeError, fluid.layers.elementwise_add, x1, y1)
# the input dtype of elementwise_add must be float16, float32, float64, int32 or int64
# float16 can only be used on a GPU place
x2 = fluid.layers.data(name='x2', shape=[3, 4, 5, 6], dtype="uint8")
y2 = fluid.layers.data(name='y2', shape=[3, 4, 5, 6], dtype="uint8")
self.assertRaises(TypeError, fluid.layers.elementwise_add, x2, y2)
class TestAddOp(unittest.TestCase):
def test_name(self):
with fluid.program_guard(fluid.Program()):
x = fluid.data(name="x", shape=[2, 3], dtype="float32")
y = fluid.data(name='y', shape=[2, 3], dtype='float32')
y_1 = paddle.add(x, y, name='add_res')
self.assertEqual(('add_res' in y_1.name), True)
def test_declarative(self):
with fluid.program_guard(fluid.Program()):
def gen_data():
return {
"x": np.array([2, 3, 4]).astype('float32'),
"y": np.array([1, 5, 2]).astype('float32')
}
x = fluid.data(name="x", shape=[3], dtype='float32')
y = fluid.data(name="y", shape=[3], dtype='float32')
z = paddle.add(x, y)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
z_value = exe.run(feed=gen_data(), fetch_list=[z.name])
z_expected = np.array([3., 8., 6.])
self.assertEqual((z_value == z_expected).all(), True)
def test_dygraph(self):
with fluid.dygraph.guard():
np_x = np.array([2, 3, 4]).astype('float64')
np_y = np.array([1, 5, 2]).astype('float64')
x = fluid.dygraph.to_variable(np_x)
y = fluid.dygraph.to_variable(np_y)
z = paddle.add(x, y)
np_z = z.numpy()
z_expected = np.array([3., 8., 6.])
self.assertEqual((np_z == z_expected).all(), True)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import sys
sys.path.append("..")
import paddle.fluid.core as core
import unittest
import numpy as np
from op_test import OpTest
import paddle
import paddle.fluid as fluid
from paddle.fluid import Program, program_guard
def generate_compatible_shapes(dim_X, dim_Y, transpose_X, transpose_Y):
BATCH_SIZE = 2
M = 3
N = 4
K = 5
if (dim_X == 1 and transpose_X) or (dim_Y == 1 and transpose_Y):
K = 1
if dim_X == 1:
if transpose_X:
shape_X = [M]
else:
shape_X = [K]
if dim_Y == 1:
if transpose_Y:
shape_Y = [N]
else:
shape_Y = [K]
if dim_X >= 2:
if transpose_X:
shape_X = [K, M]
else:
shape_X = [M, K]
if dim_X == 3:
shape_X = [BATCH_SIZE] + shape_X
if dim_Y >= 2:
if transpose_Y:
shape_Y = [N, K]
else:
shape_Y = [K, N]
if dim_Y == 3:
shape_Y = [BATCH_SIZE] + shape_Y
return shape_X, shape_Y
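# Worked example (illustrative only): generate_compatible_shapes(2, 3, False, True)
# returns ([3, 5], [2, 4, 5]), i.e. X is [M, K] and Y is [BATCH_SIZE, N, K],
# which matmul with transpose_Y=True treats as [BATCH_SIZE, K, N].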
def reference_matmul(X, Y, transpose_X=False, transpose_Y=False):
"""Reference forward implementation using np.matmul."""
# np.matmul does not support the transpose flags, so we manually
# transpose X and Y appropriately.
if transpose_X:
if X.ndim == 1:
X = X.reshape((X.size, 1))
elif X.ndim == 2:
X = X.T
else:
dim = [i for i in range(len(X.shape))]
dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1]
X = np.transpose(X, tuple(dim))
if transpose_Y:
if Y.ndim == 1:
Y = Y.reshape((1, Y.size))
else:
dim = [i for i in range(len(Y.shape))]
dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1]
Y = np.transpose(Y, tuple(dim))
Out = np.matmul(X, Y)
if not Out.shape:
# We do not support 0-dimensional Tensors (scalars). So where
# np.matmul outputs a scalar, we must convert to a Tensor of
# shape (1, ) instead.
# Everywhere else, we are compatible with np.matmul.
Out = np.array([Out], dtype="float32")
return Out
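# Minimal sanity check of the reference (illustrative values only): for
# X = np.ones((2, 3)) and Y = np.ones((3, 4)), reference_matmul(X, Y) has shape
# (2, 4) with every entry equal to 3.0; passing X.T with transpose_X=True gives
# the same result.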
class Generator(object):
def setUp(self):
self.op_type = "matmul"
X = np.random.random(self.shape_X).astype("float32")
Y = np.random.random(self.shape_Y).astype("float32")
Out = reference_matmul(X, Y, self.transpose_X, self.transpose_Y)
self.inputs = {'X': X, 'Y': Y}
self.attrs = {
'transpose_X': self.transpose_X,
'transpose_Y': self.transpose_Y
}
self.outputs = {'Out': Out}
def test_check_output(self):
self.check_output()
if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len(
self.inputs['Y'].shape) and self.inputs['X'].shape[
0] == self.inputs['Y'].shape[0]:
place = paddle.XPUPlace(0)
self.check_output_with_place(place, atol=1e-3)
def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-3)
if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len(
self.inputs['Y'].shape) and self.inputs['X'].shape[
0] == self.inputs['Y'].shape[0]:
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['X', 'Y'], 'Out', max_relative_error=5e-2)
def test_check_grad_ignore_x(self):
self.check_grad(
['Y'], 'Out', max_relative_error=1e-3, no_grad_set=set("X"))
if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len(
self.inputs['Y'].shape) and self.inputs['X'].shape[
0] == self.inputs['Y'].shape[0]:
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['Y'],
'Out',
max_relative_error=5e-2,
no_grad_set=set("X"))
def test_check_grad_ignore_y(self):
self.check_grad(
['X'], 'Out', max_relative_error=1e-3, no_grad_set=set('Y'))
if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len(
self.inputs['Y'].shape) and self.inputs['X'].shape[
0] == self.inputs['Y'].shape[0]:
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['X'],
'Out',
max_relative_error=5e-2,
no_grad_set=set('Y'))
class TestMatmulOpError(unittest.TestCase):
def test_errors(self):
with program_guard(Program(), Program()):
# The input types of matmul_op must be Variable.
input1 = 12
self.assertRaises(TypeError, fluid.layers.matmul, input1, input1)
# The input dtype of matmul_op must be float32 or float64.
input2 = fluid.layers.data(
name='input2', shape=[10, 10], dtype="int32")
self.assertRaises(TypeError, fluid.layers.matmul, input2, input2)
input3 = fluid.layers.data(
name='input3', shape=[2, 2], dtype="float16")
fluid.layers.matmul(input3, input3)
# Negative dimension generation
def generate_negative_dims(in_shape):
from itertools import combinations
size = len(in_shape)
indexs = list()
shapes = list()
for i in range(size):
indexs.extend(list(combinations([j for j in range(size)], i + 1)))
for idx in indexs:
shapes.append(
[in_shape[i] if i not in idx else -1 for i in range(size)])
return shapes
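# For example (illustrative): generate_negative_dims([2, 3]) returns
# [[-1, 3], [2, -1], [-1, -1]], i.e. every non-empty subset of dimensions is
# replaced by -1, the "unknown at compile time" size accepted by fluid.data.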
# Build programs whose input shapes contain negative (unknown) dimensions
def test_negative_dims_program(obj):
for shape_x in generate_negative_dims(obj.shape_X):
for shape_y in generate_negative_dims(obj.shape_Y):
X = np.random.random(obj.shape_X).astype("float32")
Y = np.random.random(obj.shape_Y).astype("float32")
Ref = reference_matmul(X, Y, obj.transpose_X, obj.transpose_Y)
with program_guard(Program(), Program()):
x = fluid.data(name='x', shape=shape_x, dtype='float32')
y = fluid.data(name='y', shape=shape_y, dtype='float32')
output = fluid.layers.matmul(x, y, obj.transpose_X,
obj.transpose_Y)
obj.assertEqual(len(Ref.shape), len(output.shape))
for idx in range(len(Ref.shape)):
if output.shape[idx] != -1:
obj.assertEqual(Ref.shape[idx], output.shape[idx])
exe = fluid.Executor(fluid.CPUPlace())
res, = exe.run(fluid.default_main_program(),
feed={'x': X,
'y': Y},
fetch_list=[output])
# Assert the numeric result; a bare np.allclose call would silently discard it.
obj.assertTrue(np.allclose(res, Ref, atol=1e-5))
# Generate program API test cases for all negative-dimension possibilities
def api_test(dim_x, dim_y, trans_x, trans_y):
test_name = ('TestMatMulAPI_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
dim_x, dim_y, trans_x, trans_y))
shape_x, shape_y = generate_compatible_shapes(dim_x, dim_y, trans_x,
trans_y)
globals()[test_name] = type(test_name, (unittest.TestCase, ), {
'shape_X': shape_x,
'shape_Y': shape_y,
'transpose_X': trans_x,
'transpose_Y': trans_y,
'test_program': test_negative_dims_program,
})
# Generate operator test cases for all possibilities
def inject_test(dim_x, dim_y, trans_x, trans_y):
test_name = ('TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
dim_x, dim_y, trans_x, trans_y))
shape_x, shape_y = generate_compatible_shapes(dim_x, dim_y, trans_x,
trans_y)
globals()[test_name] = type(test_name, (Generator, OpTest), {
'shape_X': shape_x,
'shape_Y': shape_y,
'transpose_X': trans_x,
'transpose_Y': trans_y,
})
for dim_X in (1, 2, 3):
for dim_Y in (1, 2, 3):
for transpose_x in (False, True):
for transpose_y in (False, True):
inject_test(dim_X, dim_Y, transpose_x, transpose_y)
api_test(dim_X, dim_Y, transpose_x, transpose_y)
# Test case n-dim
def generate_compatible_shapes(dim, transpose_X, transpose_Y):
M = 2
N = 4
K = 3
shape_X = [2 for _ in range(dim - 2)]
shape_Y = [2 for _ in range(dim - 2)]
if transpose_X:
shape_X += [K, M]
else:
shape_X += [M, K]
if transpose_Y:
shape_Y += [N, K]
else:
shape_Y += [K, N]
return shape_X, shape_Y
# Test case n-dim
for dim in [4]:
for transpose_X in [False, True]:
for transpose_Y in [False, True]:
test_name = (
'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
dim, dim, transpose_X, transpose_Y))
shape_X, shape_Y = generate_compatible_shapes(dim, transpose_X,
transpose_Y)
globals()[test_name] = type(test_name, (Generator, OpTest), {
'shape_X': shape_X,
'shape_Y': shape_Y,
'transpose_X': transpose_X,
'transpose_Y': transpose_Y,
})
class API_TestMm(unittest.TestCase):
def test_out(self):
with fluid.program_guard(fluid.Program()):
x = fluid.data(name="x", shape=[2], dtype="float64")
y = fluid.data(name='y', shape=[2], dtype='float64')
res = fluid.data(name="output", shape=[1], dtype="float64")
result = paddle.mm(x, y)
exe = fluid.Executor(fluid.CPUPlace())
data1 = np.random.rand(2)
data2 = np.random.rand(2)
np_res = exe.run(feed={'x': data1, 'y': data2}, fetch_list=[result])
expected_result = np.matmul(
data1.reshape(1, 2), data2.reshape(2, 1))
self.assertTrue(
np.allclose(
np_res, expected_result, atol=1e-5),
"two value is\
{}\n{}, check diff!".format(np_res, expected_result))
def test_dygraph_without_out(self):
device = fluid.CPUPlace()
with fluid.dygraph.guard(device):
input_array1 = np.random.rand(3, 4).astype("float64")
input_array2 = np.random.rand(4, 3).astype("float64")
data1 = fluid.dygraph.to_variable(input_array1)
data2 = fluid.dygraph.to_variable(input_array2)
out = paddle.mm(data1, data2)
expected_result = np.matmul(input_array1, input_array2)
self.assertTrue(np.allclose(expected_result, out.numpy()))
class Test_API_Matmul(unittest.TestCase):
def test_dygraph_without_out(self):
device = fluid.CPUPlace()
with fluid.dygraph.guard(device):
input_array1 = np.random.rand(3, 4).astype("float64")
input_array2 = np.random.rand(4, 3).astype("float64")
data1 = fluid.dygraph.to_variable(input_array1)
data2 = fluid.dygraph.to_variable(input_array2)
out = paddle.matmul(data1, data2)
expected_result = np.matmul(input_array1, input_array2)
self.assertTrue(np.allclose(expected_result, out.numpy()))
class API_TestMmError(unittest.TestCase):
def test_errors(self):
def test_error1():
with fluid.program_guard(fluid.Program(), fluid.Program()):
data1 = fluid.data(name="data1", shape=[10, 2], dtype="float32")
data2 = fluid.data(name="data2", shape=[3, 10], dtype="float32")
paddle.mm(data1, data2)
self.assertRaises(ValueError, test_error1)
def test_error2():
with fluid.program_guard(fluid.Program(), fluid.Program()):
data1 = fluid.data(
name="data1", shape=[-1, 10, 2], dtype="float32")
data2 = fluid.data(
name="data2", shape=[-1, 2, 10], dtype="float32")
paddle.mm(data1, data2)
test_error2()
def test_error3():
with fluid.program_guard(fluid.Program(), fluid.Program()):
data1 = fluid.data(
name="data1", shape=[10, 10, 2], dtype="float32")
data2 = fluid.data(
name="data2", shape=[3, 2, 10], dtype="float32")
paddle.mm(data1, data2)
self.assertRaises(ValueError, test_error3)
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
import sys
sys.path.append("..")
from op_test import OpTest
import paddle.fluid as fluid
from paddle.fluid import Program, program_guard
class TestMulOp(OpTest):
def setUp(self):
self.op_type = "mul"
self.dtype = np.float64
self.init_dtype_type()
self.inputs = {
'X': np.random.random((20, 5)).astype(self.dtype),
'Y': np.random.random((5, 21)).astype(self.dtype)
}
self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
def init_dtype_type(self):
pass
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out')
def test_check_grad_ingore_x(self):
self.check_grad(
['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
def test_check_grad_ingore_y(self):
self.check_grad(
['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
class TestMulOpError(unittest.TestCase):
def test_errors(self):
with program_guard(Program(), Program()):
# The input type of mul_op must be Variable.
x1 = fluid.create_lod_tensor(
np.array([[-1]]), [[1]], fluid.CPUPlace())
x2 = fluid.create_lod_tensor(
np.array([[-1]]), [[1]], fluid.CPUPlace())
self.assertRaises(TypeError, fluid.layers.mul, x1, x2)
# The input dtype of mul_op must be float32 or float64.
x3 = fluid.layers.data(name='x3', shape=[4], dtype="int32")
x4 = fluid.layers.data(name='x4', shape=[4], dtype="int32")
self.assertRaises(TypeError, fluid.layers.mul, x3, x4)
class TestMulOp2(OpTest):
def setUp(self):
self.op_type = "mul"
self.dtype = np.float64
self.init_dtype_type()
self.inputs = {
'X': np.random.random((3, 4, 2, 9)).astype(self.dtype),
'Y': np.random.random((3, 6, 1, 2, 3)).astype(self.dtype)
}
self.attrs = {
'x_num_col_dims': 2,
'y_num_col_dims': 2,
}
result = np.dot(self.inputs['X'].reshape(3 * 4, 2 * 9),
self.inputs['Y'].reshape(3 * 6, 1 * 2 * 3))
result = result.reshape(3, 4, 1, 2, 3)
self.outputs = {'Out': result}
def init_dtype_type(self):
pass
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out')
def test_check_grad_ingore_x(self):
self.check_grad(
['Y'], 'Out', max_relative_error=0.5, no_grad_set=set('X'))
def test_check_grad_ignore_y(self):
self.check_grad(
['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUMulOp1(TestMulOp):
def init_dtype_type(self):
self.dtype = np.float32
def test_check_output(self):
place = paddle.XPUPlace(0)
self.check_output_with_place(place, atol=1e-1)
def test_check_grad_normal(self):
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['X', 'Y'], 'Out', max_relative_error=0.5)
def test_check_grad_ingore_x(self):
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
def test_check_grad_ingore_y(self):
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUMulOp2(TestMulOp2):
def init_dtype_type(self):
self.dtype = np.float32
def test_check_output(self):
place = paddle.XPUPlace(0)
self.check_output_with_place(place, atol=2e-1)
def test_check_grad_normal(self):
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['X', 'Y'], 'Out', max_relative_error=0.9)
def test_check_grad_ingore_x(self):
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
def test_check_grad_ingore_y(self):
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['X'], 'Out', max_relative_error=0.9, no_grad_set=set('Y'))
if __name__ == "__main__":
unittest.main()
......@@ -16,12 +16,11 @@ from .metrics import *
from . import metrics
from ..fluid.layers.metric_op import accuracy, auc
from ..fluid.layers.nn import chunk_eval, cos_sim, mean_iou
from ..fluid.layers.nn import chunk_eval, mean_iou
__all__ = metrics.__all__ + [
'accuracy',
'auc',
'chunk_eval',
'cos_sim',
'mean_iou',
]
......@@ -38,11 +38,9 @@ from .clip import GradientClipByValue #DEFINE_ALIAS
# from .clip import set_gradient_clip #DEFINE_ALIAS
from .clip import clip #DEFINE_ALIAS
from .clip import clip_by_norm #DEFINE_ALIAS
from .control_flow import case #DEFINE_ALIAS
from .control_flow import cond #DEFINE_ALIAS
# from .control_flow import DynamicRNN #DEFINE_ALIAS
# from .control_flow import StaticRNN #DEFINE_ALIAS
from .control_flow import switch_case #DEFINE_ALIAS
from .control_flow import while_loop #DEFINE_ALIAS
# from .control_flow import rnn #DEFINE_ALIAS
# from .decode import BeamSearchDecoder #DEFINE_ALIAS
......
......@@ -13,18 +13,13 @@
# limitations under the License.
# TODO: define the control flow api
from ..fluid.layers import case #DEFINE_ALIAS
from ..fluid.layers import cond #DEFINE_ALIAS
from ..fluid.layers import while_loop #DEFINE_ALIAS
from ..fluid.layers import switch_case #DEFINE_ALIAS
__all__ = [
'case',
'cond',
# 'DynamicRNN',
# 'StaticRNN',
'switch_case',
'while_loop',
# 'rnn'
]
......@@ -933,7 +933,7 @@ def ctc_loss(log_probs,
is integrated into the Warp-CTC library to normalize values for each row of the input tensor.
Parameters:
log_probs (Tensor): The unscaled probability sequence with padding, which is a 3-D Tensor. The tensor shape is [max_logit_length, batch_size, num_classes + 1], where max_logit_length is the longest length of input logit sequence. The data type must be float32.
log_probs (Tensor): The unscaled probability sequence with padding, which is a 3-D Tensor. The tensor shape is [max_logit_length, batch_size, num_classes + 1], where max_logit_length is the longest length of input logit sequence. The data type should be float32 or float64.
labels (Tensor): The ground truth sequence with padding, which must be a 3-D Tensor. The tensor shape is [batch_size, max_label_length], where max_label_length is the longest length of label sequence. The data type must be int32.
input_lengths (Tensor): The length for each input sequence, it should have shape [batch_size] and dtype int64.
label_lengths (Tensor): The length for each label sequence, it should have shape [batch_size] and dtype int64.
......
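For reference, a minimal ctc_loss call consistent with the docstring above could look like the sketch below; the shapes and values are invented for illustration and assume the paddle.nn.functional.ctc_loss signature documented here.
import numpy as np
import paddle
import paddle.nn.functional as F
paddle.disable_static()
# log_probs: [max_logit_length, batch_size, num_classes + 1], float32 or float64
log_probs = paddle.to_tensor(np.random.rand(5, 2, 4).astype('float32'))
# labels: [batch_size, max_label_length], int32, padded with the blank index 0
labels = paddle.to_tensor(np.array([[1, 2, 2], [1, 3, 0]], dtype='int32'))
input_lengths = paddle.to_tensor(np.array([5, 5], dtype='int64'))
label_lengths = paddle.to_tensor(np.array([3, 2], dtype='int64'))
loss = F.ctc_loss(log_probs, labels, input_lengths, label_lengths, blank=0, reduction='mean')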
......@@ -571,15 +571,26 @@ def max_pool1d(x,
padding = _expand_low_nd_padding(padding)
if in_dygraph_mode():
pool_out = core.ops.max_pool2d_with_index(
x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride,
'paddings', padding, 'padding_algorithm', padding_algorithm,
'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False,
'exclusive', True, 'data_format', data_format)
return (squeeze(pool_out[0], [2]), squeeze(
pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2])
if return_indices:
pool_out = core.ops.max_pool2d_with_index(
x, 'ksize', kernel_size, 'global_pooling', False, 'strides',
stride, 'paddings', padding, 'padding_algorithm',
padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode,
'use_mkldnn', False, 'exclusive', True, 'data_format',
data_format)
return (squeeze(pool_out[0], [2]), squeeze(
pool_out[1],
[2])) if return_indices else squeeze(pool_out[0], [2])
else:
pool_out = core.ops.pool2d(
x, 'pooling_type', 'max', 'ksize', kernel_size,
'global_pooling', False, 'padding_algorithm', padding_algorithm,
'strides', stride, 'paddings', padding, 'use_cudnn', True,
'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True,
'data_format', data_format)
return squeeze(pool_out, [2])
op_type = 'max_pool2d_with_index'
op_type = 'max_pool2d_with_index' if return_indices else "pool2d"
helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype()
pool_out = helper.create_variable_for_type_inference(dtype)
......@@ -696,7 +707,7 @@ def max_pool2d(x,
)
if in_dygraph_mode():
if data_format == "NCHW":
if return_indices:
output = core.ops.max_pool2d_with_index(
x, 'ksize', kernel_size, 'global_pooling', False, 'strides',
stride, 'paddings', padding, 'padding_algorithm',
......@@ -704,7 +715,7 @@ def max_pool2d(x,
'use_mkldnn', False, 'exclusive', True, 'data_format',
data_format)
return output if return_indices else output[0]
elif data_format == "NHWC" and not return_indices:
else:
output = core.ops.pool2d(
x, 'pooling_type', 'max', 'ksize', kernel_size,
'global_pooling', False, 'padding_algorithm', padding_algorithm,
......@@ -713,7 +724,7 @@ def max_pool2d(x,
'data_format', data_format)
return output
op_type = 'max_pool2d_with_index' if data_format == "NCHW" else "pool2d"
op_type = 'max_pool2d_with_index' if return_indices else "pool2d"
helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype()
pool_out = helper.create_variable_for_type_inference(dtype)
......@@ -822,7 +833,7 @@ def max_pool3d(x,
)
if in_dygraph_mode():
if data_format == "NCDHW":
if return_indices:
output = core.ops.max_pool3d_with_index(
x, 'pooling_type', 'max', 'ksize', kernel_size, 'strides',
stride, 'paddings', padding, 'global_pooling', False,
......@@ -830,7 +841,7 @@ def max_pool3d(x,
'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True,
'data_format', data_format)
return output if return_indices else output[0]
elif data_format == "NDHWC" and not return_indices:
else:
output = core.ops.pool3d(
x, 'pooling_type', 'max', 'ksize', kernel_size,
'global_pooling', False, 'padding_algorithm', padding_algorithm,
......@@ -839,7 +850,7 @@ def max_pool3d(x,
'data_format', data_format)
return output
op_type = "max_pool3d_with_index" if data_format == "NCDHW" else "pool3d"
op_type = "max_pool3d_with_index" if return_indices else "pool3d"
helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype()
pool_out = helper.create_variable_for_type_inference(dtype)
......
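With the dispatch fix above, the kernel is chosen by whether indices are requested rather than by data layout; a hedged usage sketch follows (argument names taken from this file, input values invented).
import numpy as np
import paddle
import paddle.nn.functional as F
paddle.disable_static()
x = paddle.to_tensor(np.random.rand(1, 3, 8).astype('float32'))  # NCL input
out = F.max_pool1d(x, kernel_size=2, stride=2)  # dispatches to the pool2d kernel
out, indices = F.max_pool1d(x, kernel_size=2, stride=2, return_indices=True)  # max_pool2d_with_index kernel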
......@@ -248,7 +248,7 @@ class Conv1d(_ConvNd):
padding = 0
if self._padding_mode != "zeros":
x = F.pad(x,
self._padding,
self._reversed_padding_repeated_twice,
mode=self._padding_mode,
data_format=self._data_format)
else:
......
......@@ -773,7 +773,7 @@ class CTCLoss(fluid.dygraph.Layer):
reduction (string, optional): Indicates how to average the loss; the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the output loss is divided by label_lengths and the mean of the quotient is returned; if ``'sum'``, the sum of the loss is returned; if ``'none'``, no reduction is applied. Default is ``'mean'``.
Shape:
log_probs (Tensor): The unscaled probability sequence with padding, which is a 3-D Tensor. The tensor shape is [max_logit_length, batch_size, num_classes + 1], where max_logit_length is the longest length of input logit sequence. The data type must be float32.
log_probs (Tensor): The unscaled probability sequence with padding, which is a 3-D Tensor. The tensor shape is [max_logit_length, batch_size, num_classes + 1], where max_logit_length is the longest length of input logit sequence. The data type should be float32 or float64.
labels (Tensor): The ground truth sequence with padding, which must be a 3-D Tensor. The tensor shape is [batch_size, max_label_length], where max_label_length is the longest length of label sequence. The data type must be int32.
input_lengths (Tensor): The length for each input sequence, it should have shape [batch_size] and dtype int64.
label_lengths (Tensor): The length for each label sequence, it should have shape [batch_size] and dtype int64.
......
......@@ -17,6 +17,7 @@ __all__ = [
'batch_norm',
'embedding',
'bilinear_tensor_product',
'case',
'conv2d',
'conv2d_transpose',
'conv3d',
......@@ -34,11 +35,13 @@ __all__ = [
'prelu',
'row_conv',
'spectral_norm',
'switch_case',
]
from ...fluid.layers import fc #DEFINE_ALIAS
from ...fluid.layers import batch_norm #DEFINE_ALIAS
from ...fluid.layers import bilinear_tensor_product #DEFINE_ALIAS
from ...fluid.layers import case #DEFINE_ALIAS
from ...fluid.layers import conv2d #DEFINE_ALIAS
from ...fluid.layers import conv2d_transpose #DEFINE_ALIAS
from ...fluid.layers import conv3d #DEFINE_ALIAS
......@@ -56,5 +59,6 @@ from ...fluid.layers import nce #DEFINE_ALIAS
from ...fluid.layers import prelu #DEFINE_ALIAS
from ...fluid.layers import row_conv #DEFINE_ALIAS
from ...fluid.layers import spectral_norm #DEFINE_ALIAS
from ...fluid.layers import switch_case #DEFINE_ALIAS
from ...fluid.input import embedding #DEFINE_ALIAS
......@@ -707,20 +707,14 @@ def cross(x, y, axis=None, name=None):
Examples:
.. code-block:: python
import paddle
from paddle import to_variable
import numpy as np
paddle.disable_static()
data_x = np.array([[1.0, 1.0, 1.0],
[2.0, 2.0, 2.0],
[3.0, 3.0, 3.0]])
data_y = np.array([[1.0, 1.0, 1.0],
[1.0, 1.0, 1.0],
[1.0, 1.0, 1.0]])
x = to_variable(data_x)
y = to_variable(data_y)
x = paddle.to_tensor([[1.0, 1.0, 1.0],
[2.0, 2.0, 2.0],
[3.0, 3.0, 3.0]])
y = paddle.to_tensor([[1.0, 1.0, 1.0],
[1.0, 1.0, 1.0],
[1.0, 1.0, 1.0]])
z1 = paddle.cross(x, y)
print(z1.numpy())
# [[-1. -1. -1.]
......
......@@ -1650,12 +1650,11 @@ def cumsum(x, axis=None, dtype=None, name=None):
.. code-block:: python
import paddle
from paddle import to_variable
import numpy as np
paddle.disable_static()
data_np = np.arange(12).reshape(3, 4)
data = to_variable(data_np)
data = paddle.to_tensor(data_np)
y = paddle.cumsum(data)
print(y.numpy())
......
......@@ -286,7 +286,7 @@ fi
# Get the list of PR authors with unresolved unit test issues
pip install PyGithub
# For getting PR related data
wget https://paddle-ci.gz.bcebos.com/blk/block.txt --no-check-certificate
wget https://sys-p0.bj.bcebos.com/blk/block.txt --no-check-certificate
wget https://sys-p0.bj.bcebos.com/bk-ci/bk.txt --no-check-certificate
HASUTFIXED=`python ${PADDLE_ROOT}/tools/check_ut.py | grep "has unit-test to be fixed" || true`
if [ "${HASUTFIXED}" != "" ]; then
......
......@@ -251,9 +251,10 @@
"BilinearTensorProduct",
"GroupNorm",
"SpectralNorm",
"TreeConv",
"TreeConv"
],
"wlist_temp":[
"prroi_pool",
"to_tensor",
"ChunkEvaluator",
"EditDistance",
"ErrorClipByValue",
......@@ -406,7 +407,9 @@
"TransformerDecoder.prepare_incremental_cache",
"LinearChainCRF.forward",
"CRFDecoding.forward",
"SequenceTagging.forward"
"SequenceTagging.forward",
"XPUPlace",
"is_compiled_with_xpu"
],
"gpu_not_white":[
"deformable_conv",
......