Commit 83c2d35a authored by: S seiriosPlus

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into optimize/large_scale_kv_spped

......@@ -16,7 +16,7 @@ else()
set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs10 "30 35 50 52 60 61 70 75")
set(paddle_known_gpu_archs11 "35 50 52 60 61 70 75 80")
set(paddle_known_gpu_archs11 "52 60 61 70 75 80")
endif()
######################################################################################
......
......@@ -19,7 +19,7 @@ SET(DGC_SOURCES_DIR "${THIRD_PARTY_PATH}/dgc/src/extern_dgc")
SET(DGC_INSTALL_DIR "${THIRD_PARTY_PATH}/install/dgc")
SET(DGC_INCLUDE_DIR "${DGC_INSTALL_DIR}/include" CACHE PATH "dgc include directory." FORCE)
SET(DGC_LIBRARIES "${DGC_INSTALL_DIR}/lib/libdgc.a" CACHE FILEPATH "dgc library." FORCE)
SET(DGC_URL "http://fleet.bj.bcebos.com/collective_ef2216a.tgz")
SET(DGC_URL "https://fleet.bj.bcebos.com/dgc/collective_f66ef73.tgz")
INCLUDE_DIRECTORIES(${DGC_INCLUDE_DIR})
cache_third_party(extern_dgc
......@@ -30,7 +30,7 @@ ExternalProject_Add(
extern_dgc
${EXTERNAL_PROJECT_LOG_ARGS}
"${DGC_DOWNLOAD_CMD}"
URL_MD5 "2f67549fd5f1262383d83289abc4f88f"
URL_MD5 "94e6fa1bc97169d0e1aad44570fe3251"
PREFIX "${DGC_PREFIX_DIR}"
SOURCE_DIR "${DGC_SOURCES_DIR}"
CONFIGURE_COMMAND ""
......
......@@ -34,7 +34,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite)
if(NOT LITE_GIT_TAG)
set(LITE_GIT_TAG dfdfa6440c83bf0b415f9f5a9ff84842ce0bb0fa)
set(LITE_GIT_TAG 6d2b2a4028a58715b01887b04eb9bff8432eb184)
endif()
if(NOT CUDA_ARCH_NAME)
......
......@@ -19,8 +19,8 @@ SET(MKLDNN_PREFIX_DIR ${THIRD_PARTY_PATH}/mkldnn)
SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn)
SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn)
SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
SET(MKLDNN_REPOSITORY https://github.com/intel/mkl-dnn.git)
SET(MKLDNN_TAG 1ea812f4f5aa1bd989372a23ab50d0f0f81ee677)
SET(MKLDNN_REPOSITORY https://github.com/oneapi-src/oneDNN.git)
SET(MKLDNN_TAG 64a48f9565aa72f6359917b3406328075a409939)
# Introduce variables:
# * CMAKE_INSTALL_LIBDIR
......
......@@ -18,7 +18,7 @@ SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc)
SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
set(WARPCTC_REPOSITORY https://github.com/baidu-research/warp-ctc.git)
set(WARPCTC_TAG bc29dcfff07ced1c7a19a4ecee48e5ad583cef8e)
set(WARPCTC_TAG fc7f226b93758216a03b1be9d24593a12819b984)
SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
CACHE PATH "Warp-ctc Directory" FORCE)
......
......@@ -28,7 +28,15 @@ function(CheckCompilerCXX11Flag)
endfunction()
CheckCompilerCXX11Flag()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
if (WITH_GPU)
if (${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.0)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
endif()
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
endif()
# safe_set_flag
#
# Set a compile flag only if compiler is support
......
......@@ -243,9 +243,10 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
ENDIF()
if(WITH_GPU)
include(external/cub) # download cub
list(APPEND third_party_deps extern_cub)
if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
include(external/cub) # download cub
list(APPEND third_party_deps extern_cub)
endif()
set(CUDAERROR_URL "http://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "" FORCE)
file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") # download file cudaErrorMessage
endif(WITH_GPU)
......
......@@ -49,7 +49,8 @@ std::vector<std::string> PD_GetGradOpDescStrs(
for (size_t i = 0; i < op_num; ++i) {
PADDLE_ENFORCE_EQ(
grad_op_descs[i]->Proto()->SerializePartialToString(&ret[i]), true,
"Cannot serialize message.");
paddle::platform::errors::Unavailable(
"Cannot serialize operator desc message."));
}
}
return ret;
......
......@@ -36,7 +36,10 @@ message AMPConfig {
repeated string custom_black_varnames = 9;
}
message LocalSGDConfig { optional int32 k_steps = 1 [ default = 4 ]; }
message LocalSGDConfig {
optional int32 k_steps = 1 [ default = 1 ];
optional int32 begin_step = 2 [ default = 1 ];
}
message GradientMergeConfig {
optional int32 k_steps = 1 [ default = 1 ];
......@@ -52,6 +55,8 @@ message DGCConfig {
message LarsConfig {
optional float lars_coeff = 1 [ default = 0.001 ];
optional float lars_weight_decay = 2 [ default = 0.0005 ];
optional float epsilon = 3 [ default = 0.0 ];
repeated string exclude_from_weight_decay = 4;
}
message LambConfig {
......
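For reference, a minimal sketch of reading the updated LocalSGDConfig fields through the generated protobuf C++ API. The generated header name and message namespace are assumptions, not taken from this diff:

#include <iostream>
#include "distributed_strategy.pb.h"  // assumed name of the generated header

int main() {
  paddle::fleet::LocalSGDConfig cfg;    // namespace assumed from the proto package
  std::cout << cfg.k_steps() << "\n";   // default changed from 4 to 1 by this diff
  std::cout << cfg.begin_step() << "\n";  // new field, default 1
  cfg.set_begin_step(10);
  return 0;
}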
......@@ -25,7 +25,7 @@ bool NCCLWrapper::is_initialized_ = false;
void NCCLWrapper::InitNCCL() {
#if defined(PADDLE_WITH_NCCL)
PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitRank(
&(nccl_info_.comm_), nccl_info_.global_ranks_, nccl_info_.nccl_id_,
nccl_info_.my_global_rank_));
#endif
......@@ -41,7 +41,8 @@ void NCCLWrapper::SetNCCLId(const NCCLInfo& nccl_info) {
NCCLInfo NCCLWrapper::GetNCCLId() {
#if defined(PADDLE_WITH_NCCL)
PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_)));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_)));
#endif
return nccl_info_;
}
......@@ -52,8 +53,8 @@ void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank,
nccl_info_.local_rank_ = local_rank;
nccl_info_.my_global_rank_ = global_rank;
nccl_info_.global_ranks_ = ranks;
PADDLE_ENFORCE(cudaSetDevice(local_rank));
PADDLE_ENFORCE(cudaStreamCreate(&(nccl_info_.stream_)));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(local_rank));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&(nccl_info_.stream_)));
#endif
return;
}
......@@ -65,7 +66,7 @@ void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope,
auto var = scope.FindVar(name);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
int32_t total_size = tensor->numel();
PADDLE_ENFORCE(platform::dynload::ncclBcast(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
reinterpret_cast<void*>(tensor->data<float>()), total_size, ncclFloat,
root_rank, nccl_info_.comm_, nccl_info_.stream_));
cudaStreamSynchronize(nccl_info_.stream_);
......
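The hunks above consistently replace bare PADDLE_ENFORCE around CUDA/NCCL calls with PADDLE_ENFORCE_CUDA_SUCCESS. A self-contained sketch of the underlying check-the-status-code pattern; this is an illustrative macro, not Paddle's actual implementation:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Treat any non-success status from a CUDA call as a hard error with context.
#define CHECK_CUDA(call)                                              \
  do {                                                                \
    cudaError_t status_ = (call);                                     \
    if (status_ != cudaSuccess) {                                     \
      std::fprintf(stderr, "CUDA error: %s at %s:%d\n",               \
                   cudaGetErrorString(status_), __FILE__, __LINE__);  \
      std::abort();                                                   \
    }                                                                 \
  } while (0)

int main() {
  CHECK_CUDA(cudaSetDevice(0));
  cudaStream_t stream;
  CHECK_CUDA(cudaStreamCreate(&stream));
  CHECK_CUDA(cudaStreamDestroy(stream));
  return 0;
}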
......@@ -42,7 +42,8 @@ void ThreadPool::Init() {
num_threads = FLAGS_dist_threadpool_size;
VLOG(1) << "set dist_threadpool_size to " << num_threads;
}
PADDLE_ENFORCE_GT(num_threads, 0);
PADDLE_ENFORCE_GT(num_threads, 0, platform::errors::InvalidArgument(
"The number of threads is 0."));
threadpool_.reset(new ThreadPool(num_threads));
}
}
......@@ -83,7 +84,8 @@ void ThreadPool::TaskLoop() {
}
if (tasks_.empty()) {
PADDLE_THROW("This thread has no task to Run");
PADDLE_THROW(platform::errors::Unavailable(
"Current thread has no task to Run."));
}
// pop a task from the task queue
......
......@@ -91,7 +91,8 @@ class ThreadPool {
{
std::unique_lock<std::mutex> lock(mutex_);
if (!running_) {
PADDLE_THROW("enqueue on stopped ThreadPool");
PADDLE_THROW(platform::errors::Unavailable(
"Task is enqueued into stopped ThreadPool."));
}
tasks_.push(std::move(task));
}
......
......@@ -43,8 +43,9 @@ void VarDesc::SetTensorDescNum(size_t num) {
} break;
default:
PADDLE_THROW(
"Setting 'sub_tensor_number' is not supported by the type of var %s.",
this->Name());
platform::errors::Unavailable("Setting 'sub_tensor_number' is not "
"supported by the %s type variable.",
this->Name()));
}
}
......@@ -55,8 +56,9 @@ size_t VarDesc::GetTensorDescNum() const {
break;
default:
PADDLE_THROW(
"Getting 'sub_tensor_number' is not supported by the type of var %s.",
this->Name());
platform::errors::Unavailable("Getting 'sub_tensor_number' is not "
"supported by the %s type variable.",
this->Name()));
}
}
......@@ -133,9 +135,9 @@ void VarDesc::SetLoDLevel(int32_t lod_level) {
desc_.mutable_type()->mutable_tensor_array()->set_lod_level(lod_level);
break;
default:
PADDLE_THROW(
"Setting 'lod_level' is not supported by the type of var %s.",
this->Name());
PADDLE_THROW(platform::errors::Unavailable(
"Setting 'lod_level' is not supported by the %s type variable.",
this->Name()));
}
}
......@@ -157,9 +159,9 @@ void VarDesc::SetLoDLevels(const std::vector<int32_t> &multiple_lod_level) {
}
} break;
default:
PADDLE_THROW(
"Setting 'lod_levels' is not supported by the type of var %s.",
this->Name());
PADDLE_THROW(platform::errors::Unavailable(
"Setting 'lod_levels' is not supported by the %s type variable",
this->Name()));
}
}
......@@ -170,9 +172,9 @@ int32_t VarDesc::GetLoDLevel() const {
case proto::VarType::LOD_TENSOR_ARRAY:
return desc_.type().tensor_array().lod_level();
default:
PADDLE_THROW(
"Getting 'lod_level' is not supported by the type of var %s.",
this->Name());
PADDLE_THROW(platform::errors::Unavailable(
"Getting 'lod_level' is not supported by the %s type variable.",
this->Name()));
}
}
......@@ -187,15 +189,19 @@ std::vector<int32_t> VarDesc::GetLoDLevels() const {
return res;
break;
default:
PADDLE_THROW(
"Getting 'lod_levels' is not supported by the type of var %s.",
this->Name());
PADDLE_THROW(platform::errors::Unavailable(
"Getting 'lod_levels' is not supported by the %s type variable.",
this->Name()));
}
}
const proto::VarType::TensorDesc &VarDesc::tensor_desc() const {
PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set.");
PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
PADDLE_ENFORCE_EQ(
desc_.has_type(), true,
platform::errors::NotFound("The variable's type has not been set."));
PADDLE_ENFORCE_EQ(
desc_.type().has_type(), true,
platform::errors::NotFound("The variable's type has not been set."));
switch (desc_.type().type()) {
case proto::VarType::SELECTED_ROWS:
return desc_.type().selected_rows();
......@@ -204,14 +210,16 @@ const proto::VarType::TensorDesc &VarDesc::tensor_desc() const {
case proto::VarType::LOD_TENSOR_ARRAY:
return desc_.type().tensor_array().tensor();
default:
PADDLE_THROW(
"Getting 'tensor_desc' is not supported by the type of var %s.",
this->Name());
PADDLE_THROW(platform::errors::Unavailable(
"Getting 'tensor_desc' is not supported by the %s type variable.",
this->Name()));
}
}
std::vector<proto::VarType::TensorDesc> VarDesc::tensor_descs() const {
PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
PADDLE_ENFORCE_EQ(
desc_.has_type(), true,
platform::errors::NotFound("The variable's type has not been set."));
std::vector<proto::VarType::TensorDesc> res;
res.reserve(GetTensorDescNum());
switch (desc_.type().type()) {
......@@ -221,16 +229,19 @@ std::vector<proto::VarType::TensorDesc> VarDesc::tensor_descs() const {
}
return res;
default:
PADDLE_THROW(
"Getting 'tensor_descs' is not supported by the type of var "
"%s.",
this->Name());
PADDLE_THROW(platform::errors::Unavailable(
"Getting 'tensor_descs' is not supported by the %s type variable.",
this->Name()));
}
}
proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() {
PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
PADDLE_ENFORCE_EQ(
desc_.has_type(), true,
platform::errors::NotFound("The variable's type has not been set."));
PADDLE_ENFORCE_EQ(
desc_.type().has_type(), true,
platform::errors::NotFound("The variable's type has not been set."));
switch (desc_.type().type()) {
case proto::VarType::SELECTED_ROWS:
return desc_.mutable_type()->mutable_selected_rows();
......@@ -240,15 +251,19 @@ proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() {
return desc_.mutable_type()->mutable_tensor_array()->mutable_tensor();
default:
PADDLE_THROW(
"Getting 'mutable_tensor_desc' is not supported by the type of var "
"%s.",
this->Name());
platform::errors::Unavailable("Getting 'mutable_tensor_desc' is not "
"supported by the %s type variable.",
this->Name()));
}
}
std::vector<proto::VarType::TensorDesc *> VarDesc::mutable_tensor_descs() {
PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
PADDLE_ENFORCE_EQ(
desc_.has_type(), true,
platform::errors::NotFound("The variable's type has not been set."));
PADDLE_ENFORCE_EQ(
desc_.type().has_type(), true,
platform::errors::NotFound("The variable's type has not been set."));
std::vector<proto::VarType::TensorDesc *> res;
res.reserve(GetTensorDescNum());
switch (desc_.type().type()) {
......@@ -259,10 +274,9 @@ std::vector<proto::VarType::TensorDesc *> VarDesc::mutable_tensor_descs() {
}
return res;
default:
PADDLE_THROW(
"Getting 'tensor_descs' is not supported by the type of var "
"%s.",
this->Name());
PADDLE_THROW(platform::errors::Unavailable(
"Getting 'tensor_descs' is not supported by the %s type variable.",
this->Name()));
}
}
......
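All of the VarDesc changes above follow one migration: PADDLE_ENFORCE/PADDLE_THROW with a bare format string become PADDLE_ENFORCE_EQ/PADDLE_THROW carrying a typed error object. A toy, self-contained sketch of that pattern (not Paddle's actual macros):

#include <stdexcept>
#include <string>

namespace errors {
// Each error category gets its own type, so callers can catch selectively.
struct NotFound : public std::runtime_error {
  explicit NotFound(const std::string& msg)
      : std::runtime_error("NotFound: " + msg) {}
};
}  // namespace errors

template <typename A, typename B, typename Err>
void EnforceEq(const A& a, const B& b, const Err& err) {
  if (!(a == b)) throw err;  // surface the typed, descriptive error upward
}

int main() {
  EnforceEq(2 + 2, 4,
            errors::NotFound("The variable's type has not been set."));
  return 0;
}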
......@@ -40,7 +40,8 @@ inline proto::VarType::Type ToVarType(int type) {
case proto::VarType::READER:
return static_cast<proto::VarType::Type>(type);
default:
PADDLE_THROW("ToVarType:Unsupported type %d", type);
PADDLE_THROW(platform::errors::Unavailable(
"ToVarType method Unsupported type %d.", type));
}
}
......@@ -66,7 +67,8 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
visitor(var.Get<FetchList>());
return;
default:
PADDLE_THROW("Not supported visit type, %s", ToTypeName(var.Type()));
PADDLE_THROW(platform::errors::Unavailable("Not supported visit type %s.",
ToTypeName(var.Type())));
}
}
......
......@@ -46,12 +46,14 @@ struct VarIdToTypeIndexMapInitializerImpl {
static_assert(!std::is_same<Type, void>::value, "Type cannot be void");
constexpr int kId = VarTypeTrait<Type>::kId;
auto type = std::type_index(typeid(Type));
PADDLE_ENFORCE(id_to_type->count(kId) == 0,
"Registered duplicate type id %d for type %s", kId,
type.name());
PADDLE_ENFORCE(type_to_id->count(type) == 0,
"Registered duplicate type_index %s for id %d", type.name(),
kId);
PADDLE_ENFORCE_EQ(
id_to_type->count(kId), 0,
platform::errors::AlreadyExists(
"Registered duplicate type id %d for type %s.", kId, type.name()));
PADDLE_ENFORCE_EQ(
type_to_id->count(type), 0,
platform::errors::AlreadyExists(
"Registered duplicate type index %s for id %d.", type.name(), kId));
id_to_type->emplace(kId, type);
type_to_id->emplace(type, kId);
VarIdToTypeIndexMapInitializerImpl<kStart + 1, kEnd,
......@@ -79,15 +81,17 @@ struct VarIdToTypeIndexMapHolder {
public:
static const std::type_index &ToTypeIndex(int var_id) {
auto it = Instance().id_to_type_map_.find(var_id);
PADDLE_ENFORCE(it != Instance().id_to_type_map_.end(),
"VarId %d is not registered.", var_id);
PADDLE_ENFORCE_NE(it, Instance().id_to_type_map_.end(),
platform::errors::NotFound(
"Variable Id %d is not registered.", var_id));
return it->second;
}
static int ToTypeId(const std::type_index &type) {
auto it = Instance().type_to_id_map_.find(type);
PADDLE_ENFORCE(it != Instance().type_to_id_map_.end(),
"VarType %s is not registered.", type.name());
PADDLE_ENFORCE_NE(it, Instance().type_to_id_map_.end(),
platform::errors::NotFound(
"Variable Type %s is not registered.", type.name()));
return it->second;
}
......
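A condensed, self-contained sketch of the id-to-type bimap that VarIdToTypeIndexMapInitializerImpl builds, with duplicate registrations rejected as in the AlreadyExists checks above (illustrative only):

#include <cassert>
#include <typeindex>
#include <unordered_map>

struct TypeRegistry {
  std::unordered_map<int, std::type_index> id_to_type;
  std::unordered_map<std::type_index, int> type_to_id;

  template <typename T>
  void Register(int id) {
    std::type_index t(typeid(T));
    // Mirror the AlreadyExists enforcement: each id and type is unique.
    assert(id_to_type.count(id) == 0 && "duplicate type id");
    assert(type_to_id.count(t) == 0 && "duplicate type index");
    id_to_type.emplace(id, t);
    type_to_id.emplace(t, id);
  }
};

int main() {
  TypeRegistry reg;
  reg.Register<int>(0);
  reg.Register<float>(1);
  return 0;
}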
......@@ -50,11 +50,11 @@ void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
} else if (var_type == proto::VarType::RAW) {
// GetMutable will be called in operator
} else {
PADDLE_THROW(
PADDLE_THROW(platform::errors::Unavailable(
"Variable type %d is not in "
"[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
"LOD_RANK_TABLE, PLACE_LIST, READER, RAW]",
var_type);
"LOD_RANK_TABLE, PLACE_LIST, READER, RAW].",
var_type));
}
}
......@@ -76,7 +76,8 @@ void CopyVariable(const Variable &src_var, Variable *dst_var) {
auto *dst_t = tmp_grad_slr->mutable_value();
framework::TensorCopy(src_t, cpu_place, dst_t);
} else {
PADDLE_THROW("unknown var type to copy");
PADDLE_THROW(
platform::errors::Unavailable("Unknown variable type to copy."));
}
}
......
......@@ -218,6 +218,10 @@ struct Argument {
DECL_ARGUMENT_FIELD(fusion_statis, FusionStatis, fusion_statis_t);
// Only used in paddle-lite subgraph.
DECL_ARGUMENT_FIELD(cpu_math_library_num_threads, CpuMathLibraryNumThreads,
int);
private:
std::unordered_set<std::string> valid_fields_;
};
......
......@@ -150,6 +150,8 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("use_xpu", new bool(argument->use_xpu()));
pass->Set("xpu_l3_workspace_size",
new int(argument->xpu_l3_workspace_size()));
pass->Set("cpu_math_library_num_threads",
new int(argument->cpu_math_library_num_threads()));
}
disable_logs_ = argument->disable_logs();
if (pass_name == "fc_fuse_pass") {
......
......@@ -244,6 +244,7 @@ void LiteSubgraphPass::SetUpEngine(
bool enable_int8 = Get<bool>("enable_int8");
bool use_xpu = Get<bool>("use_xpu");
int xpu_l3_workspace_size = Get<int>("xpu_l3_workspace_size");
int cpu_math_library_num_threads = Get<int>("cpu_math_library_num_threads");
lite_api::TargetType target_type;
if (use_gpu) {
......@@ -263,11 +264,12 @@ void LiteSubgraphPass::SetUpEngine(
// Notice: The ordering here determines the device where the
// input tensor of the Lite engine is located, and then affects
// whether tensor sharing is feasible.
paddle::lite::Place({target_type, precision_type}),
paddle::lite::Place({target_type, PRECISION(kInt64)}),
paddle::lite::Place({target_type, PRECISION(kFloat)}),
paddle::lite::Place({TARGET(kHost), PRECISION(kFloat)}),
paddle::lite_api::Place({target_type, precision_type}),
paddle::lite_api::Place({target_type, PRECISION(kInt64)}),
paddle::lite_api::Place({target_type, PRECISION(kFloat)}),
paddle::lite_api::Place({TARGET(kHost), PRECISION(kFloat)}),
};
config.cpu_math_library_num_threads = cpu_math_library_num_threads;
config.xpu_l3_workspace_size = xpu_l3_workspace_size;
if (dump_model) {
lite::StrToBinaryFile("./model.bin", config.model);
......
......@@ -53,12 +53,10 @@ if(WITH_TESTING)
inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS paddle_fluid_shared
ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST")
elseif(WIN32)
inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps}
ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST")
endif()
endif()
......
......@@ -461,6 +461,8 @@ void AnalysisPredictor::PrepareArgument() {
}
if (config_.lite_engine_enabled()) {
argument_.SetCpuMathLibraryNumThreads(
config_.cpu_math_library_num_threads());
argument_.SetLitePrecisionMode(config_.lite_precision_mode_);
argument_.SetLitePassesFilter(config_.lite_passes_filter_);
argument_.SetLiteOpsFilter(config_.lite_ops_filter_);
......
......@@ -21,15 +21,21 @@
namespace paddle {
void ZeroCopyTensor::Reshape(const std::vector<int> &shape) {
PADDLE_ENFORCE(!name_.empty(),
"Need to SetName first, so that the corresponding tensor can "
"be retrieved.");
PADDLE_ENFORCE(input_or_output_,
"Can't reshape the output tensor, it is readonly");
PADDLE_ENFORCE(scope_);
PADDLE_ENFORCE_EQ(
name_.empty(), false,
platform::errors::PreconditionNotMet(
"Need to SetName first, so that the corresponding tensor can "
"be retrieved."));
PADDLE_ENFORCE_EQ(input_or_output_, true,
platform::errors::PermissionDenied(
"Can't reshape the output tensor, it is readonly"));
PADDLE_ENFORCE_NOT_NULL(scope_, platform::errors::PreconditionNotMet(
"The scope should not be nullptr."));
auto *scope = static_cast<framework::Scope *>(scope_);
auto *var = scope->FindVar(name_);
PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::PreconditionNotMet(
"No tensor called [%s] in the runtime scope", name_));
auto *tensor = var->GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim(shape));
}
......@@ -45,8 +51,10 @@ T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
EAGER_GET_TENSOR;
PADDLE_ENFORCE_GT(
tensor->numel(), 0,
"You should call ZeroCopyTensor::Reshape(const std::vector<int> &shape)"
"function before retrieving mutable_data from input tensor.");
platform::errors::PreconditionNotMet(
"You should call ZeroCopyTensor::Reshape(const std::vector<int> "
"&shape)"
"function before retrieving mutable_data from input tensor."));
switch (static_cast<int>(place)) {
case static_cast<int>(PaddlePlace::kCPU): {
return tensor->mutable_data<T>(platform::CPUPlace());
......@@ -55,7 +63,8 @@ T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
return tensor->mutable_data<T>(platform::CUDAPlace(device_));
}
default:
PADDLE_THROW("Unsupported place: %d", static_cast<int>(place));
PADDLE_THROW(platform::errors::Unavailable("Unsupported place: %d",
static_cast<int>(place)));
break;
}
return nullptr;
......@@ -96,10 +105,11 @@ PaddleDType ZeroCopyTensor::type() const {
template <typename T>
void ZeroCopyTensor::copy_from_cpu(const T *data) {
EAGER_GET_TENSOR;
PADDLE_ENFORCE_GE(
tensor->numel(), 0,
"You should call ZeroCopyTensor::Reshape(const std::vector<int> &shape)"
"function before copying data from cpu.");
PADDLE_ENFORCE_GE(tensor->numel(), 0,
platform::errors::PreconditionNotMet(
"You should call ZeroCopyTensor::Reshape(const "
"std::vector<int> &shape)"
"function before copying data from cpu."));
size_t ele_size = tensor->numel() * sizeof(T);
if (place_ == PaddlePlace::kCPU) {
......@@ -116,7 +126,8 @@ void ZeroCopyTensor::copy_from_cpu(const T *data) {
memory::Copy(gpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
data, ele_size, dev_ctx->stream());
#else
PADDLE_THROW("Not compiled with CUDA, should not reach here.");
PADDLE_THROW(platform::errors::Unavailable(
"Not compiled with CUDA, should not reach here."));
#endif
}
}
......@@ -141,7 +152,8 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
cudaStreamSynchronize(dev_ctx->stream());
#else
PADDLE_THROW("Not compile with CUDA, should not reach here.");
PADDLE_THROW(platform::errors::Unavailable(
"Not compile with CUDA, should not reach here."));
#endif
}
}
......@@ -176,20 +188,27 @@ template PD_INFER_DECL uint8_t *ZeroCopyTensor::mutable_data<uint8_t>(
PaddlePlace place);
void *ZeroCopyTensor::FindTensor() const {
PADDLE_ENFORCE(!name_.empty(),
"Need to SetName first, so that the corresponding tensor can "
"be retrieved.");
PADDLE_ENFORCE(scope_);
PADDLE_ENFORCE_EQ(
name_.empty(), false,
platform::errors::PreconditionNotMet(
"Need to SetName first, so that the corresponding tensor can "
"be retrieved."));
PADDLE_ENFORCE_NOT_NULL(scope_, platform::errors::PreconditionNotMet(
"The scope should not be nullptr."));
auto *scope = static_cast<framework::Scope *>(scope_);
auto *var = scope->FindVar(name_);
PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::PreconditionNotMet(
"No tensor called [%s] in the runtime scope", name_));
auto *tensor = var->GetMutable<framework::LoDTensor>();
return tensor;
}
std::vector<int> ZeroCopyTensor::shape() const {
EAGER_GET_TENSOR;
PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_);
PADDLE_ENFORCE_NOT_NULL(
tensor_, platform::errors::PreconditionNotMet(
"Not found tensor called %s in the scope", name_));
return framework::vectorize<int>(tensor->dims());
}
......
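A hedged usage sketch of the ZeroCopyTensor contract these messages enforce (Reshape before mutable_data/copy_from_cpu). The model path is a placeholder, and CreatePaddlePredictor/GetInputTensor/ZeroCopyRun are assumed from the public C++ API rather than shown in this diff:

#include <functional>
#include <numeric>
#include <vector>
#include "paddle_inference_api.h"

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("./model_dir");       // placeholder model directory
  config.SwitchUseFeedFetchOps(false);  // required for the zero-copy path

  auto predictor = paddle::CreatePaddlePredictor(config);
  auto input = predictor->GetInputTensor(predictor->GetInputNames()[0]);

  std::vector<float> data(6, 1.0f);
  input->Reshape({2, 3});             // must precede copy_from_cpu/mutable_data
  input->copy_from_cpu(data.data());

  predictor->ZeroCopyRun();

  auto output = predictor->GetOutputTensor(predictor->GetOutputNames()[0]);
  auto shape = output->shape();
  int numel = std::accumulate(shape.begin(), shape.end(), 1,
                              std::multiplies<int>());
  std::vector<float> result(numel);
  output->copy_to_cpu(result.data());
  return 0;
}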
......@@ -31,12 +31,30 @@ limitations under the License. */
#include "paddle_analysis_config.h" // NOLINT
#include "paddle_api.h" // NOLINT
///
/// \file paddle_inference_api.h
///
/// \brief Paddle Inference API
///
/// \author paddle-infer@baidu.com
/// \date 2020-09-01
/// \since 2.0.0-beta
///
namespace paddle_infer {
using DataType = paddle::PaddleDType;
using PlaceType = paddle::PaddlePlace;
using PrecisionType = paddle::AnalysisConfig::Precision;
using Config = paddle::AnalysisConfig;
///
/// \class Tensor
///
/// \brief Represents an n-dimensional array of values.
/// The Tensor is used to store the input or output of the network.
/// It is obtained through Predictor::GetInputHandle()
/// and Predictor::GetOutputHandle() interface.
///
class PD_INFER_DECL Tensor {
public:
// Can only be created by predictor->GetInputHandle(const std::string& name)
......@@ -44,33 +62,106 @@ class PD_INFER_DECL Tensor {
Tensor() = delete;
explicit Tensor(std::unique_ptr<paddle::ZeroCopyTensor>&& tensor)
: tensor_(std::move(tensor)) {}
///
/// \brief Reset the shape of the tensor.
/// Generally it's only used for the input tensor.
/// Reshape must be called before calling mutable_data() or CopyFromCpu().
/// \param shape The shape to set.
///
void Reshape(const std::vector<int>& shape);
///
/// \brief Copy the host memory to tensor data.
/// It's usually used to set the input tensor data.
/// \param data The pointer of the data, from which the tensor will copy.
///
template <typename T>
void CopyFromCpu(const T* data);
// should add the place
///
/// \brief Get the memory pointer in CPU or GPU with specific data type.
/// Please Reshape the tensor first before calling this.
/// It's usually used to get input data pointer.
/// \param place The place of the tensor.
/// \return The tensor data buffer pointer.
///
template <typename T>
T* mutable_data(PlaceType place);
///
/// \brief Copy the tensor data to the host memory.
/// It's usually used to get the output tensor data.
/// \param[out] data The tensor will copy the data to the address.
///
template <typename T>
void CopyToCpu(T* data);
///
/// \brief Get the memory pointer directly.
/// It's usually used to get the output data pointer.
/// \param[out] place To get the device type of the tensor.
/// \param[out] size To get the data size of the tensor.
/// \return The tensor data buffer pointer.
///
template <typename T>
T* data(PlaceType* place, int* size) const;
///
/// \brief Set lod info of the tensor.
/// More about LOD can be seen here:
/// https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/basic_concept/lod_tensor.html#lodtensor
/// \param x the lod info.
///
void SetLoD(const std::vector<std::vector<size_t>>& x);
/// \brief Return the lod info of the tensor.
std::vector<std::vector<size_t>> lod() const;
/// \brief Return the data type of the tensor.
/// It's usually used to get the output tensor data type.
/// \return The data type of the tensor.
DataType type() const;
/// \brief Return the shape of the Tensor.
std::vector<int> shape() const;
/// \brief Return the name of the tensor.
const std::string& name() const;
private:
std::unique_ptr<paddle::ZeroCopyTensor> tensor_;
};
///
/// \class Predictor
///
/// \brief Predictor is the interface for model prediction.
///
/// The predictor has the following typical uses:
///
/// Get predictor
/// \code{cpp}
/// auto predictor = CreatePredictor(config);
/// \endcode
///
/// Get input or output names
/// \code{cpp}
/// auto input_names = predictor->GetInputNames();
/// auto output_names = predictor->GetOutputNames();
/// \endcode
///
/// Get input or output handle
/// \code{cpp}
/// auto input_t = predictor->GetInputHandle(input_names[0]);
/// auto output_t = predictor->GetOutputHandle(output_names[0]);
/// \endcode
///
/// Run predictor
/// \code{cpp}
/// predictor->Run();
/// \endcode
///
class PD_INFER_DECL Predictor {
public:
Predictor() = delete;
......@@ -79,25 +170,78 @@ class PD_INFER_DECL Predictor {
explicit Predictor(std::unique_ptr<paddle::PaddlePredictor>&& pred)
: predictor_(std::move(pred)) {}
///
/// \brief Construct a new Predictor object
///
/// \param[in] Config config
///
explicit Predictor(const Config& config);
///
/// \brief Get the input names
///
/// \return input names
///
std::vector<std::string> GetInputNames();
///
/// \brief Get the Input Tensor object
///
/// \param[in] name input name
/// \return input tensor
///
std::unique_ptr<Tensor> GetInputHandle(const std::string& name);
///
/// \brief Run the prediction engine
///
/// \return Whether the function executed successfully
///
bool Run();
///
/// \brief Get the output names
///
/// \return output names
///
std::vector<std::string> GetOutputNames();
///
/// \brief Get the Output Tensor object
///
/// \param[in] name output name
/// \return output tensor
///
std::unique_ptr<Tensor> GetOutputHandle(const std::string& name);
///
/// \brief Clone to get a new predictor. Thread safe.
///
/// \return get a new predictor
///
std::unique_ptr<Predictor> Clone();
/// \brief Clear the intermediate tensors of the predictor
void ClearIntermediateTensor();
private:
std::unique_ptr<paddle::PaddlePredictor> predictor_;
};
///
/// \brief A factory to help create predictors.
///
/// Usage:
///
/// \code{.cpp}
/// Config config;
/// ... // change the configs.
/// auto predictor = CreatePredictor(config);
/// \endcode
///
PD_INFER_DECL std::shared_ptr<Predictor> CreatePredictor(
const Config& config); // NOLINT
PD_INFER_DECL int GetNumBytesOfDataType(DataType dtype);
PD_INFER_DECL std::string GetVersion();
......@@ -128,13 +272,24 @@ T* Tensor::data(PlaceType* place, int* size) const {
namespace paddle_infer {
namespace services {
///
/// \class PredictorPool
///
/// \brief PredictorPool is a simple encapsulation of Predictor, suitable for
/// use in multi-threaded situations. According to the thread id, the
/// corresponding Predictor is taken from the PredictorPool to run the
/// prediction.
///
class PD_INFER_DECL PredictorPool {
public:
PredictorPool() = delete;
PredictorPool(const PredictorPool&) = delete;
PredictorPool& operator=(const PredictorPool&) = delete;
/// \brief Construct the predictor pool with \param size predictor instances.
explicit PredictorPool(const Config& config, size_t size = 1);
/// \brief Get the idx-th predictor.
Predictor* Retrive(size_t idx);
private:
......
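Pulling the documented pieces together, a minimal end-to-end sketch using only the paddle_infer API declared above; the model path and input shape are placeholders:

#include <functional>
#include <numeric>
#include <vector>
#include "paddle_inference_api.h"

int main() {
  paddle_infer::Config config;
  config.SetModel("./model_dir");  // placeholder model directory
  auto predictor = paddle_infer::CreatePredictor(config);

  auto input = predictor->GetInputHandle(predictor->GetInputNames()[0]);
  std::vector<float> data(1 * 3 * 224 * 224, 0.f);  // placeholder shape
  input->Reshape({1, 3, 224, 224});  // Reshape before CopyFromCpu
  input->CopyFromCpu(data.data());

  predictor->Run();

  auto output = predictor->GetOutputHandle(predictor->GetOutputNames()[0]);
  auto shape = output->shape();
  int numel = std::accumulate(shape.begin(), shape.end(), 1,
                              std::multiplies<int>());
  std::vector<float> result(numel);
  output->CopyToCpu(result.data());
  return 0;
}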
......@@ -16,6 +16,7 @@
#include <vector>
#include "paddle/fluid/inference/capi/c_api_internal.h"
#include "paddle/fluid/inference/capi/paddle_c_api.h"
#include "paddle/fluid/platform/enforce.h"
using paddle::ConvertToACPrecision;
using paddle::ConvertToPaddleDType;
......@@ -34,27 +35,37 @@ void PD_DeletePaddleBuf(PD_PaddleBuf* buf) {
}
void PD_PaddleBufResize(PD_PaddleBuf* buf, size_t length) {
PADDLE_ENFORCE_NOT_NULL(buf);
PADDLE_ENFORCE_NOT_NULL(buf,
paddle::platform::errors::InvalidArgument(
"The pointer of Buffer shouldn't be nullptr"));
buf->buf.Resize(length);
}
void PD_PaddleBufReset(PD_PaddleBuf* buf, void* data, size_t length) {
PADDLE_ENFORCE_NOT_NULL(buf);
PADDLE_ENFORCE_NOT_NULL(buf,
paddle::platform::errors::InvalidArgument(
"The pointer of Buffer shouldn't be nullptr"));
buf->buf.Reset(data, length);
}
bool PD_PaddleBufEmpty(PD_PaddleBuf* buf) {
PADDLE_ENFORCE_NOT_NULL(buf);
PADDLE_ENFORCE_NOT_NULL(buf,
paddle::platform::errors::InvalidArgument(
"The pointer of Buffer shouldn't be nullptr"));
return buf->buf.empty();
}
void* PD_PaddleBufData(PD_PaddleBuf* buf) {
PADDLE_ENFORCE_NOT_NULL(buf);
PADDLE_ENFORCE_NOT_NULL(buf,
paddle::platform::errors::InvalidArgument(
"The pointer of Buffer shouldn't be nullptr"));
return buf->buf.data();
}
size_t PD_PaddleBufLength(PD_PaddleBuf* buf) {
PADDLE_ENFORCE_NOT_NULL(buf);
PADDLE_ENFORCE_NOT_NULL(buf,
paddle::platform::errors::InvalidArgument(
"The pointer of Buffer shouldn't be nullptr"));
return buf->buf.length();
}
......
......@@ -18,7 +18,6 @@
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/api/paddle_api.h"
#include "paddle/fluid/inference/capi/paddle_c_api.h"
#include "paddle/fluid/platform/enforce.h"
using PD_PaddleDType = paddle::PaddleDType;
using PD_ACPrecision = paddle::AnalysisConfig::Precision;
......
......@@ -20,6 +20,7 @@
#include <vector>
#include "paddle/fluid/inference/capi/c_api_internal.h"
#include "paddle/fluid/inference/capi/paddle_c_api.h"
#include "paddle/fluid/platform/enforce.h"
using paddle::ConvertToACPrecision;
using paddle::ConvertToPaddleDType;
......@@ -40,7 +41,10 @@ void PD_DeleteAnalysisConfig(PD_AnalysisConfig* config) {
void PD_SetModel(PD_AnalysisConfig* config, const char* model_dir,
const char* params_path) {
LOG(INFO) << model_dir;
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
LOG(INFO) << std::string(model_dir);
if (!params_path) {
config->config.SetModel(std::string(model_dir));
......@@ -50,104 +54,164 @@ void PD_SetModel(PD_AnalysisConfig* config, const char* model_dir,
}
void PD_SetProgFile(PD_AnalysisConfig* config, const char* x) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SetProgFile(std::string(x));
}
void PD_SetParamsFile(PD_AnalysisConfig* config, const char* x) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SetParamsFile(std::string(x));
}
void PD_SetOptimCacheDir(PD_AnalysisConfig* config, const char* opt_cache_dir) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SetOptimCacheDir(std::string(opt_cache_dir));
}
const char* PD_ModelDir(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.model_dir().c_str();
}
const char* PD_ProgFile(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.prog_file().c_str();
}
const char* PD_ParamsFile(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.params_file().c_str();
}
void PD_EnableUseGpu(PD_AnalysisConfig* config, int memory_pool_init_size_mb,
int device_id) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableUseGpu(static_cast<uint64_t>(memory_pool_init_size_mb),
device_id);
}
void PD_DisableGpu(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.DisableGpu();
}
bool PD_UseGpu(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.use_gpu();
}
int PD_GpuDeviceId(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.gpu_device_id();
}
int PD_MemoryPoolInitSizeMb(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.memory_pool_init_size_mb();
}
float PD_FractionOfGpuMemoryForPool(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.fraction_of_gpu_memory_for_pool();
}
void PD_EnableCUDNN(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableCUDNN();
}
bool PD_CudnnEnabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.cudnn_enabled();
}
void PD_SwitchIrOptim(PD_AnalysisConfig* config, bool x) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SwitchIrOptim(x);
}
bool PD_IrOptim(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.ir_optim();
}
void PD_SwitchUseFeedFetchOps(PD_AnalysisConfig* config, bool x) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SwitchUseFeedFetchOps(x);
}
bool PD_UseFeedFetchOpsEnabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.use_feed_fetch_ops_enabled();
}
void PD_SwitchSpecifyInputNames(PD_AnalysisConfig* config, bool x) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SwitchSpecifyInputNames(x);
}
bool PD_SpecifyInputName(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.specify_input_name();
}
......@@ -155,110 +219,168 @@ void PD_EnableTensorRtEngine(PD_AnalysisConfig* config, int workspace_size,
int max_batch_size, int min_subgraph_size,
Precision precision, bool use_static,
bool use_calib_mode) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableTensorRtEngine(
workspace_size, max_batch_size, min_subgraph_size,
paddle::ConvertToACPrecision(precision), use_static, use_calib_mode);
}
bool PD_TensorrtEngineEnabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.tensorrt_engine_enabled();
}
void PD_SwitchIrDebug(PD_AnalysisConfig* config, bool x) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SwitchIrDebug(x);
}
void PD_EnableMKLDNN(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableMKLDNN();
}
void PD_SetMkldnnCacheCapacity(PD_AnalysisConfig* config, int capacity) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SetMkldnnCacheCapacity(capacity);
}
bool PD_MkldnnEnabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.mkldnn_enabled();
}
void PD_SetCpuMathLibraryNumThreads(PD_AnalysisConfig* config,
int cpu_math_library_num_threads) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SetCpuMathLibraryNumThreads(cpu_math_library_num_threads);
}
int PD_CpuMathLibraryNumThreads(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.cpu_math_library_num_threads();
}
void PD_EnableMkldnnQuantizer(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableMkldnnQuantizer();
}
bool PD_MkldnnQuantizerEnabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.mkldnn_quantizer_enabled();
}
void PD_EnableMkldnnBfloat16(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound(
"PD_AnalysisConfig should not be null"));
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableMkldnnBfloat16();
}
bool PD_MkldnnBfloat16Enabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound(
"PD_AnalysisConfig should not be null"));
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.mkldnn_bfloat16_enabled();
}
void PD_SetModelBuffer(PD_AnalysisConfig* config, const char* prog_buffer,
size_t prog_buffer_size, const char* params_buffer,
size_t params_buffer_size) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SetModelBuffer(prog_buffer, prog_buffer_size, params_buffer,
params_buffer_size);
}
bool PD_ModelFromMemory(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.model_from_memory();
}
void PD_EnableMemoryOptim(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableMemoryOptim();
}
bool PD_MemoryOptimEnabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.enable_memory_optim();
}
void PD_EnableProfile(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableProfile();
}
bool PD_ProfileEnabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.profile_enabled();
}
void PD_SetInValid(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SetInValid();
}
bool PD_IsValid(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.is_valid();
}
......
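A short sketch of driving the C API above. PD_NewAnalysisConfig is assumed from the C API header (it does not appear in this diff); every other call is shown above:

#include "paddle_c_api.h"

int main() {
  PD_AnalysisConfig* config = PD_NewAnalysisConfig();  // assumed constructor
  PD_SetModel(config, "./model_dir", NULL);  // params_path may be NULL
  PD_DisableGpu(config);
  PD_SetCpuMathLibraryNumThreads(config, 4);
  PD_SwitchIrOptim(config, true);
  // Passing a null config to any of these now raises InvalidArgument
  // instead of tripping a bare PADDLE_ENFORCE_NOT_NULL.
  PD_DeleteAnalysisConfig(config);
  return 0;
}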
......@@ -22,6 +22,7 @@
#include "paddle/fluid/inference/api/paddle_api.h"
#include "paddle/fluid/inference/capi/c_api_internal.h"
#include "paddle/fluid/inference/capi/paddle_c_api.h"
#include "paddle/fluid/platform/enforce.h"
using paddle::ConvertToACPrecision;
using paddle::ConvertToPaddleDType;
......@@ -81,7 +82,10 @@ extern "C" {
bool PD_PredictorRun(const PD_AnalysisConfig* config, PD_Tensor* inputs,
int in_size, PD_Tensor** output_data, int* out_size,
int batch_size) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
VLOG(3) << "Predoctor: PD_PredictorRun. ";
static std::map<std::string, std::unique_ptr<paddle::PaddlePredictor>>
predictors;
......@@ -111,7 +115,10 @@ bool PD_PredictorRun(const PD_AnalysisConfig* config, PD_Tensor* inputs,
bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config,
PD_ZeroCopyData* inputs, int in_size,
PD_ZeroCopyData** output, int* out_size) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
static std::map<std::string, std::unique_ptr<paddle::PaddlePredictor>>
predictors;
if (!predictors.count(config->config.model_dir())) {
......@@ -144,7 +151,8 @@ bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config,
input_t->copy_from_cpu(static_cast<uint8_t*>(inputs[i].data));
break;
default:
CHECK(false) << "Unsupport data type.";
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"Unsupported data type."));
break;
}
}
......@@ -227,7 +235,8 @@ void PD_SetZeroCopyInput(PD_Predictor* predictor,
input->copy_from_cpu(static_cast<uint8_t*>(tensor->data.data));
break;
default:
CHECK(false) << "Unsupport data type.";
PADDLE_THROW(
paddle::platform::errors::InvalidArgument("Unsupported data type."));
break;
}
......@@ -294,7 +303,8 @@ void PD_GetZeroCopyOutput(PD_Predictor* predictor, PD_ZeroCopyTensor* tensor) {
output->copy_to_cpu(reinterpret_cast<uint8_t*>(tensor->data.data));
break;
default:
CHECK(false) << "Unsupport data type.";
PADDLE_THROW(
paddle::platform::errors::InvalidArgument("Unsupported data type."));
break;
}
}
......
......@@ -19,6 +19,7 @@
#include <vector>
#include "paddle/fluid/inference/capi/c_api_internal.h"
#include "paddle/fluid/inference/capi/paddle_c_api.h"
#include "paddle/fluid/platform/enforce.h"
using paddle::ConvertToACPrecision;
using paddle::ConvertToPaddleDType;
......@@ -37,44 +38,60 @@ void PD_DeletePaddleTensor(PD_Tensor* tensor) {
}
void PD_SetPaddleTensorName(PD_Tensor* tensor, char* name) {
PADDLE_ENFORCE_NOT_NULL(tensor);
PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
tensor->tensor.name = std::string(name);
}
void PD_SetPaddleTensorDType(PD_Tensor* tensor, PD_DataType dtype) {
PADDLE_ENFORCE_NOT_NULL(tensor);
PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
tensor->tensor.dtype = paddle::ConvertToPaddleDType(dtype);
}
void PD_SetPaddleTensorData(PD_Tensor* tensor, PD_PaddleBuf* buf) {
PADDLE_ENFORCE_NOT_NULL(tensor);
PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
tensor->tensor.data = buf->buf;
}
void PD_SetPaddleTensorShape(PD_Tensor* tensor, int* shape, int size) {
PADDLE_ENFORCE_NOT_NULL(tensor);
PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
tensor->tensor.shape.assign(shape, shape + size);
}
const char* PD_GetPaddleTensorName(const PD_Tensor* tensor) {
PADDLE_ENFORCE_NOT_NULL(tensor);
PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
return tensor->tensor.name.c_str();
}
PD_DataType PD_GetPaddleTensorDType(const PD_Tensor* tensor) {
PADDLE_ENFORCE_NOT_NULL(tensor);
PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
return ConvertToPDDataType(tensor->tensor.dtype);
}
PD_PaddleBuf* PD_GetPaddleTensorData(const PD_Tensor* tensor) {
PADDLE_ENFORCE_NOT_NULL(tensor);
PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
PD_PaddleBuf* ret = PD_NewPaddleBuf();
ret->buf = tensor->tensor.data;
return ret;
}
const int* PD_GetPaddleTensorShape(const PD_Tensor* tensor, int* size) {
PADDLE_ENFORCE_NOT_NULL(tensor);
PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
const std::vector<int>& shape = tensor->tensor.shape;
*size = shape.size();
return shape.data();
......
......@@ -20,8 +20,12 @@
#define LITE_WITH_XPU 1
#endif
#ifndef PADDLE_WITH_ARM
#define LITE_WITH_X86 1
#endif
#include "paddle/fluid/inference/lite/engine.h"
#include "lite/api/paddle_use_passes.h"
#include <utility>
namespace paddle {
namespace inference {
......@@ -36,32 +40,40 @@ bool EngineManager::Has(const std::string& name) const {
return engines_.at(name).get() != nullptr;
}
paddle::lite::Predictor* EngineManager::Get(const std::string& name) const {
paddle::lite_api::PaddlePredictor* EngineManager::Get(
const std::string& name) const {
return engines_.at(name).get();
}
paddle::lite::Predictor* EngineManager::Create(const std::string& name,
const EngineConfig& cfg) {
if (cfg.valid_places.front().target == TARGET(kCUDA)) {
#ifdef PADDLE_WITH_CUDA
paddle::lite::Env<TARGET(kCUDA)>::Init();
paddle::lite_api::PaddlePredictor* EngineManager::Create(
const std::string& name, const EngineConfig& cfg) {
// config info for predictor.
paddle::lite_api::CxxConfig lite_cxx_config;
lite_cxx_config.set_model_buffer(cfg.model.c_str(), cfg.model.size(),
cfg.param.c_str(), cfg.param.size());
lite_cxx_config.set_valid_places(cfg.valid_places);
#ifdef PADDLE_WITH_ARM
lite_cxx_config.set_threads(cfg.cpu_math_library_num_threads);
#else
lite_cxx_config.set_x86_math_library_num_threads(
cfg.cpu_math_library_num_threads);
#endif
} else if (cfg.valid_places.front().target == TARGET(kXPU)) {
#ifdef PADDLE_WITH_XPU
paddle::lite::TargetWrapper<TARGET(kXPU)>::workspace_l3_size_per_thread =
cfg.xpu_l3_workspace_size;
lite_cxx_config.set_xpu_workspace_l3_size_per_thread(
cfg.xpu_l3_workspace_size);
#endif
}
auto* p = new paddle::lite::Predictor();
p->Build("", cfg.model, cfg.param, cfg.valid_places, cfg.neglected_passes,
cfg.model_type, cfg.model_from_memory);
engines_[name].reset(p);
return p;
// create predictor
std::shared_ptr<paddle::lite_api::PaddlePredictor> p =
paddle::lite_api::CreatePaddlePredictor(lite_cxx_config);
engines_[name] = std::move(p);
return engines_[name].get();
}
void EngineManager::DeleteAll() {
for (auto& item : engines_) {
item.second.reset(nullptr);
item.second.reset();
}
}
......
......@@ -23,12 +23,9 @@
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wall"
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_place.h"
#include "lite/core/context.h"
#include "lite/core/device_info.h"
#include "lite/core/memory.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/api/paddle_use_passes.h"
#pragma GCC diagnostic pop
namespace paddle {
......@@ -38,25 +35,33 @@ namespace lite {
struct EngineConfig {
std::string model;
std::string param;
paddle::lite::Place prefer_place;
std::vector<paddle::lite::Place> valid_places;
std::vector<paddle::lite_api::Place> valid_places;
std::vector<std::string> neglected_passes;
lite_api::LiteModelType model_type{lite_api::LiteModelType::kProtobuf};
bool model_from_memory{true};
// for xpu
size_t xpu_l3_workspace_size;
// for x86 or arm
int cpu_math_library_num_threads{1};
// for cuda
bool use_multi_stream{false};
};
class EngineManager {
public:
bool Empty() const;
bool Has(const std::string& name) const;
paddle::lite::Predictor* Get(const std::string& name) const;
paddle::lite::Predictor* Create(const std::string& name,
const EngineConfig& cfg);
paddle::lite_api::PaddlePredictor* Get(const std::string& name) const;
paddle::lite_api::PaddlePredictor* Create(const std::string& name,
const EngineConfig& cfg);
void DeleteAll();
private:
std::unordered_map<std::string, std::unique_ptr<paddle::lite::Predictor>>
std::unordered_map<std::string,
std::shared_ptr<paddle::lite_api::PaddlePredictor>>
engines_;
};
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/inference/lite/tensor_utils.h"
#include <functional>
#include <map>
#include <memory>
#include "paddle/fluid/framework/data_type.h"
......@@ -144,16 +145,55 @@ void MemoryCopyAsync(const platform::Place& dst_place, void* dst_data,
}
}
void InitDstTensor(paddle::lite::Tensor* dst, const framework::LoDTensor& src) {
void* GetLiteTensorDataPtr(paddle::lite_api::Tensor* src,
PrecisionType precision_type,
TargetType target_type) {
void* res{nullptr};
switch (precision_type) {
case PrecisionType::kFloat:
res = static_cast<void*>(src->mutable_data<float>(target_type));
break;
case PrecisionType::kInt8:
res = static_cast<void*>(src->mutable_data<int8_t>(target_type));
break;
case PrecisionType::kInt32:
res = static_cast<void*>(src->mutable_data<int32_t>(target_type));
break;
case PrecisionType::kInt64:
res = static_cast<void*>(src->mutable_data<int64_t>(target_type));
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported precision type. Now only supports FP32, INT8, INT32 and "
"INT64."));
break;
}
return res;
}
int64_t GetLiteTensorNumel(const paddle::lite_api::Tensor& tensor) {
auto shape = tensor.shape();
int64_t numel = std::accumulate(shape.begin(), shape.end(), 1,
std::multiplies<int64_t>());
return numel;
}
void InitDstTensor(paddle::lite_api::Tensor* dst,
const framework::LoDTensor& src) {
// Currently, Lite needs to explicitly specify the target type of
// the input tensor.
constexpr int empty_size = 0;
dst->mutable_data(GetLiteTargetType(src.place()), empty_size);
dst->set_precision(GetLitePrecisionType(src.type()));
SetLoD(dst->mutable_lod(), src.lod());
dst->Resize({empty_size});
GetLiteTensorDataPtr(dst, GetLitePrecisionType(src.type()),
GetLiteTargetType(src.place()));
dst->SetPrecision(GetLitePrecisionType(src.type()));
paddle::lite::LoD lite_lod;
SetLoD(&lite_lod, src.lod());
dst->SetLoD(lite_lod);
}
void InitDstTensor(framework::LoDTensor* dst, const paddle::lite::Tensor& src) {
void InitDstTensor(framework::LoDTensor* dst,
const paddle::lite_api::Tensor& src) {
constexpr framework::proto::VarType::Type dtype =
framework::proto::VarType_Type_FP32;
dst->mutable_data(inference::lite::utils::GetNativePlace(src.target()),
......@@ -162,7 +202,8 @@ void InitDstTensor(framework::LoDTensor* dst, const paddle::lite::Tensor& src) {
}
template <>
void TensorCopyAsync(paddle::lite::Tensor* dst, const framework::LoDTensor& src,
void TensorCopyAsync(paddle::lite_api::Tensor* dst,
const framework::LoDTensor& src,
const platform::DeviceContext& ctx) {
InitDstTensor(dst, src);
const platform::Place& src_place = src.place();
......@@ -171,52 +212,56 @@ void TensorCopyAsync(paddle::lite::Tensor* dst, const framework::LoDTensor& src,
static_cast<size_t>(src.numel()) * framework::SizeOfType(src.type());
dst->Resize(framework::vectorize(src.dims()));
const void* src_data = src.data<void>();
void* dst_data = dst->mutable_data(bytes);
void* dst_data{nullptr};
dst_data = GetLiteTensorDataPtr(dst, GetLitePrecisionType(src.type()),
GetLiteTargetType(src.place()));
VLOG(3) << "[CopyAsync fluid -> lite] Bytes = " << bytes << ", src = " << &src
<< ", dst = " << dst << ", src_type = " << src.type();
MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx);
VLOG(3) << "[Lite memory size] Bytes = " << dst->memory_size();
VLOG(3) << "[Lite memory size] Bytes = " << bytes;
}
template <>
void TensorCopyAsync(framework::LoDTensor* dst, const paddle::lite::Tensor& src,
void TensorCopyAsync(framework::LoDTensor* dst,
const paddle::lite_api::Tensor& src,
const platform::DeviceContext& ctx) {
dst->Resize(paddle::framework::make_ddim(src.dims().Vectorize()));
dst->Resize(paddle::framework::make_ddim(src.shape()));
InitDstTensor(dst, src);
const platform::Place& src_place = GetNativePlace(src.target());
const platform::Place& dst_place = dst->place();
const size_t bytes =
static_cast<size_t>(src.numel()) * framework::SizeOfType(dst->type());
const void* src_data = src.raw_data();
int64_t src_numel = GetLiteTensorNumel(src);
const size_t bytes = src_numel * framework::SizeOfType(dst->type());
const void* src_data = src.data<void>();
// When Lite is ready, the source type needs to be modified here.
void* dst_data = dst->mutable_data(dst_place, dst->type());
VLOG(3) << "[CopyAsync lite -> fluid] Bytes = " << bytes << ", src = " << &src
<< ", dst = " << dst << ", src_type = " << dst->type();
MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx);
VLOG(3) << "[Lite memory size] Bytes = " << src.memory_size();
VLOG(3) << "[Lite memory size] Bytes = " << bytes;
}
template <>
void TensorDataShare(paddle::lite::Tensor* dst, framework::LoDTensor* src) {
const size_t bytes =
static_cast<size_t>(src->numel()) * framework::SizeOfType(src->type());
auto buf = std::make_shared<paddle::lite::Buffer>(paddle::lite::Buffer(
src->data<void>(), GetLiteTargetType(src->place()), src->memory_size()));
void TensorDataShare(paddle::lite_api::Tensor* dst, framework::LoDTensor* src) {
dst->Resize(framework::vectorize(src->dims()));
dst->set_precision(GetLitePrecisionType(src->type()));
SetLoD(dst->mutable_lod(), src->lod());
dst->ResetBuffer(buf, bytes);
dst->ShareExternalMemory(src->data<void>(), src->memory_size(),
GetLiteTargetType(src->place()));
dst->SetPrecision(GetLitePrecisionType(src->type()));
paddle::lite::LoD lite_lod;
SetLoD(&lite_lod, src->lod());
dst->SetLoD(lite_lod);
}
template <>
void TensorDataShare(framework::LoDTensor* dst, paddle::lite::Tensor* src) {
void TensorDataShare(framework::LoDTensor* dst, paddle::lite_api::Tensor* src) {
constexpr framework::proto::VarType::Type dtype =
framework::proto::VarType_Type_FP32;
void* src_raw_data = src->raw_data();
void* src_raw_data =
GetLiteTensorDataPtr(src, GetLitePrecisionType(dtype), src->target());
size_t memory_size = GetLiteTensorNumel(*src) * sizeof(float);
std::shared_ptr<memory::allocation::Allocation> holder(
new memory::allocation::Allocation(src_raw_data, src->memory_size(),
new memory::allocation::Allocation(src_raw_data, memory_size,
GetNativePlace(src->target())));
dst->Resize(paddle::framework::make_ddim(src->dims().Vectorize()));
dst->Resize(paddle::framework::make_ddim(src->shape()));
SetLoD(dst->mutable_lod(), src->lod());
dst->ResetHolderWithType(holder, dtype);
}
......
......@@ -102,10 +102,10 @@ TEST(EngineManager, engine) {
config.model_from_memory = true;
config.valid_places = {
#ifdef PADDLE_WITH_CUDA
paddle::lite::Place({TARGET(kCUDA), PRECISION(kFloat)}),
paddle::lite_api::Place({TARGET(kCUDA), PRECISION(kFloat)}),
#endif
paddle::lite::Place({TARGET(kX86), PRECISION(kFloat)}),
paddle::lite::Place({TARGET(kHost), PRECISION(kAny)}),
paddle::lite_api::Place({TARGET(kX86), PRECISION(kFloat)}),
paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}),
};
LOG(INFO) << "Create EngineManager";
......@@ -118,7 +118,7 @@ TEST(EngineManager, engine) {
ASSERT_EQ(inference::Singleton<inference::lite::EngineManager>::Global().Has(
unique_key),
true);
paddle::lite::Predictor* engine_0 =
paddle::lite_api::PaddlePredictor* engine_0 =
inference::Singleton<inference::lite::EngineManager>::Global().Get(
unique_key);
CHECK_NOTNULL(engine_0);
......
......@@ -73,6 +73,33 @@ TEST(LiteEngineOp, GetNativeLayoutType) {
EXPECT_ANY_THROW(GetNativeLayoutType(DataLayoutType::kNHWC));
}
template <typename T>
void test_lite_tensor_data_ptr(PrecisionType precision_type) {
void* GetLiteTensorDataPtr(paddle::lite_api::Tensor * src,
PrecisionType precision_type,
TargetType target_type);
const int count = 4;
paddle::lite::Tensor lite_tensor;
lite_tensor.Resize({count});
auto* lite_tensor_data = lite_tensor.mutable_data<T>();
for (size_t i = 0; i < count; ++i) {
lite_tensor_data[i] = i;
}
paddle::lite_api::Tensor lite_api_tensor(&lite_tensor);
T* data = static_cast<T*>(GetLiteTensorDataPtr(
&lite_api_tensor, precision_type, TargetType::kHost));
for (size_t i = 0; i < count; ++i) {
    CHECK_EQ(data[i], static_cast<T>(i)) << "the i-th element is not correct.";
}
}
TEST(LiteEngineOp, GetLiteTensorDataPtr) {
test_lite_tensor_data_ptr<int64_t>(PrecisionType::kInt64);
test_lite_tensor_data_ptr<int32_t>(PrecisionType::kInt32);
test_lite_tensor_data_ptr<int8_t>(PrecisionType::kInt8);
EXPECT_ANY_THROW(test_lite_tensor_data_ptr<double>(PrecisionType::kUnk));
}
void test_tensor_copy(const platform::DeviceContext& ctx) {
// Create LoDTensor.
std::vector<float> vector({1, 2, 3, 4});
......@@ -83,10 +110,11 @@ void test_tensor_copy(const platform::DeviceContext& ctx) {
lod_tensor.set_lod(lod);
// Create lite::Tensor and copy.
paddle::lite::Tensor lite_tensor;
TensorCopyAsync(&lite_tensor, lod_tensor, ctx);
paddle::lite_api::Tensor lite_api_tensor(&lite_tensor);
TensorCopyAsync(&lite_api_tensor, lod_tensor, ctx);
// Copy to LoDTensor.
framework::LoDTensor lod_tensor_n;
TensorCopyAsync(&lod_tensor_n, lite_tensor, ctx);
TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx);
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) {
platform::GpuStreamSync(
......@@ -108,10 +136,11 @@ void test_tensor_share(const platform::DeviceContext& ctx) {
lod_tensor.set_lod(lod);
// Create lite::Tensor and share.
paddle::lite::Tensor lite_tensor;
TensorDataShare(&lite_tensor, &lod_tensor);
paddle::lite_api::Tensor lite_api_tensor(&lite_tensor);
TensorDataShare(&lite_api_tensor, &lod_tensor);
// Copy to LoDTensor.
framework::LoDTensor lod_tensor_n;
TensorCopyAsync(&lod_tensor_n, lite_tensor, ctx);
TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx);
std::vector<float> result;
TensorToVector(lod_tensor_n, ctx, &result);
ASSERT_EQ(result, vector);
......
......@@ -63,11 +63,13 @@ void TensorRTEngine::Execute(int batch_size, std::vector<void *> *buffers,
void TensorRTEngine::FreezeNetwork() {
freshDeviceId();
VLOG(3) << "TRT to freeze network";
PADDLE_ENFORCE(infer_builder_ != nullptr,
"Call InitNetwork first to initialize network.");
PADDLE_ENFORCE_EQ(network() != nullptr, true,
platform::errors::InvalidArgument(
"Call InitNetwork first to initialize network."));
PADDLE_ENFORCE_NOT_NULL(infer_builder_,
platform::errors::InvalidArgument(
"Inference builder of TRT is null. Please make "
"sure you call InitNetwork first."));
PADDLE_ENFORCE_NOT_NULL(network(),
platform::errors::InvalidArgument(
"Call InitNetwork first to initialize network."));
// build engine.
infer_builder_->setMaxBatchSize(max_batch_);
infer_builder_->setMaxWorkspaceSize(max_workspace_);
......@@ -210,7 +212,10 @@ void TensorRTEngine::FreezeNetwork() {
} else {
infer_engine_.reset(infer_builder_->buildCudaEngine(*network()));
}
PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!");
PADDLE_ENFORCE_NOT_NULL(
infer_engine_, platform::errors::Fatal(
"Build TensorRT cuda engine failed! Please recheck "
"you configurations related to paddle-TensorRT."));
}
nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
......@@ -220,8 +225,16 @@ nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
platform::errors::InvalidArgument(
"The TRT network should be initialized first."));
auto *input = network()->addInput(name.c_str(), dtype, dims);
PADDLE_ENFORCE(input, "infer network add input %s failed", name);
PADDLE_ENFORCE(input->isNetworkInput());
PADDLE_ENFORCE_NOT_NULL(
input, platform::errors::InvalidArgument("Adding input %s failed in "
"TensorRT inference network. "
"Please recheck your input.",
name));
PADDLE_ENFORCE_EQ(input->isNetworkInput(), true,
platform::errors::InvalidArgument(
"Input %s is not the input of TRT inference network. "
"Please recheck your input.",
name));
TensorRTEngine::SetITensor(name, input);
return input;
}
......@@ -230,31 +243,53 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
const std::string &name) {
auto *output = layer->getOutput(offset);
SetITensor(name, output);
PADDLE_ENFORCE(output != nullptr);
PADDLE_ENFORCE_NOT_NULL(
output, platform::errors::InvalidArgument(
"The output %s of TRT engine should not be null.", name));
output->setName(name.c_str());
PADDLE_ENFORCE(!output->isNetworkInput());
PADDLE_ENFORCE_EQ(output->isNetworkInput(), false,
platform::errors::InvalidArgument(
"The output %s of TRT engine should not be the input "
"of the network at the same time.",
name));
network()->markOutput(*output);
PADDLE_ENFORCE(output->isNetworkOutput());
PADDLE_ENFORCE_EQ(
output->isNetworkOutput(), true,
platform::errors::InvalidArgument(
"The output %s of TRT engine should be the output of the network.",
name));
}
void TensorRTEngine::DeclareOutput(const std::string &name) {
auto *output = TensorRTEngine::GetITensor(name);
PADDLE_ENFORCE(output != nullptr);
PADDLE_ENFORCE_NOT_NULL(
output, platform::errors::InvalidArgument(
"The output %s of TRT engine should not be null.", name));
output->setName(name.c_str());
PADDLE_ENFORCE(!output->isNetworkInput());
PADDLE_ENFORCE_EQ(output->isNetworkInput(), false,
platform::errors::InvalidArgument(
"The output %s of TRT engine should not be the input "
"of the network at the same time.",
name));
network()->markOutput(*output);
}
void TensorRTEngine::SetITensor(const std::string &name,
nvinfer1::ITensor *tensor) {
PADDLE_ENFORCE(tensor != nullptr);
PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s",
name);
PADDLE_ENFORCE_NOT_NULL(
tensor, platform::errors::InvalidArgument(
"Tensor named %s of TRT engine should not be null.", name));
PADDLE_ENFORCE_EQ(
0, itensor_map_.count(name),
platform::errors::InvalidArgument(
"Tensor named %s of TRT engine should not be duplicated", name));
itensor_map_[name] = tensor;
}
nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) {
PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name);
PADDLE_ENFORCE_EQ(itensor_map_.count(name), true,
platform::errors::NotFound(
"Tensor named %s is not found in TRT engine", name));
return itensor_map_[name];
}
......@@ -271,11 +306,11 @@ float *TensorRTEngine::GetWeightCPUData(const std::string &name,
std::string splitter = "__";
std::string name_with_suffix = name + splitter + name_suffix;
platform::CPUPlace cpu_place;
PADDLE_ENFORCE_EQ(
weight_map.count(name_with_suffix), 0,
"During TRT Op converter: We set weight %s with the same name "
"twice into the weight_map",
name_with_suffix);
PADDLE_ENFORCE_EQ(weight_map.count(name_with_suffix), 0,
platform::errors::AlreadyExists(
"The weight named %s is set into the weight map "
"twice in TRT OP converter.",
name_with_suffix));
weight_map[name_with_suffix].reset(new framework::Tensor());
weight_map[name_with_suffix]->Resize(weight_tensor->dims());
TensorCopySync(*weight_tensor, cpu_place, weight_map[name_with_suffix].get());
......@@ -297,7 +332,10 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
void TensorRTEngine::freshDeviceId() {
int count;
cudaGetDeviceCount(&count);
PADDLE_ENFORCE_LT(device_id_, count);
PADDLE_ENFORCE_LT(device_id_, count,
platform::errors::OutOfRange(
"Device id %d exceeds the current device count: %d.",
device_id_, count));
cudaSetDevice(device_id_);
}
......
......@@ -196,8 +196,10 @@ class TensorRTEngine {
}
nvinfer1::IHostMemory* Serialize() {
PADDLE_ENFORCE(infer_engine_ != nullptr,
"You should build engine first and then serialize");
PADDLE_ENFORCE_NOT_NULL(
infer_engine_,
platform::errors::InvalidArgument(
"The TensorRT engine must be built first before serialization"));
ihost_memory_.reset(infer_engine_->serialize());
return ihost_memory_.get();
}
......@@ -222,8 +224,14 @@ class TensorRTEngine {
engine_serialized_data.c_str(), engine_serialized_data.size(),
&inference::Singleton<plugin::PluginFactoryTensorRT>::Global()));
}
PADDLE_ENFORCE(infer_engine_ != nullptr,
"build cuda engine failed when deserialize engine info.!");
PADDLE_ENFORCE_NOT_NULL(
infer_engine_,
platform::errors::Fatal(
"Building TRT cuda engine failed when deserializing engine info. "
"Please check:\n1. Your TRT serialization is generated and loaded "
"on the same GPU architecture;\n2. The Paddle Inference version of "
"generating serialization file and doing inference are "
"consistent."));
}
void SetRuntimeBatch(size_t batch_size);
......
......@@ -56,14 +56,27 @@ __global__ void elementwise_kernel(const size_t total, const T *x_data,
nvinfer1::Dims ElementWisePlugin::getOutputDimensions(
int index, const nvinfer1::Dims *input_dims, int num_inputs) {
PADDLE_ENFORCE_EQ(index, 0);
PADDLE_ENFORCE_EQ(num_inputs, 2);
PADDLE_ENFORCE_NOT_NULL(input_dims);
PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
"There is only one output in TRT elementwise "
"op plugin, but got output index: %d.",
index));
PADDLE_ENFORCE_EQ(num_inputs, 2, platform::errors::InvalidArgument(
"There are 2 inputs in TRT elementwise "
"op plugin, but got input number: %d.",
num_inputs));
PADDLE_ENFORCE_NOT_NULL(
input_dims,
platform::errors::InvalidArgument(
"The input dims of TRT elementwise op plugin should not be null."));
return input_dims[0];
}
int ElementWisePlugin::initialize() {
PADDLE_ENFORCE_GT(dims_y_.nbDims, 0);
PADDLE_ENFORCE_GT(dims_y_.nbDims, 0,
platform::errors::InvalidArgument(
"The dimension of input Y of TRT elementwise op plugin "
"should be greater than 0, but got %d.",
dims_y_.nbDims));
axis_ = (axis_ == -1) ? dims_x_.nbDims - dims_y_.nbDims : axis_;
int trimed_nb_dims = dims_y_.nbDims;
......@@ -74,8 +87,18 @@ int ElementWisePlugin::initialize() {
}
dims_y_.nbDims = trimed_nb_dims;
PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_);
PADDLE_ENFORCE_LT(axis_, dims_x_.nbDims);
PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_,
platform::errors::InvalidArgument(
"We expect [number of x dims] >= [number of y dims + "
"axis] in TRT elementwise op plugin, but got [number "
"of x dims] = %d, [number of y dims + axis] = %d.",
dims_x_.nbDims, dims_y_.nbDims + axis_));
PADDLE_ENFORCE_LT(
axis_, dims_x_.nbDims,
platform::errors::InvalidArgument("We expect [axis] < [number of x dims] "
"in TRT elementwise op plugin, but got "
"[axis] = %d, [number of x dims] = %d.",
axis_, dims_x_.nbDims));
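  // Illustrative example (values are assumptions, not from the source): with
  // dims_x = [2, 3, 4, 5], dims_y = [3, 4] and axis = 1, both checks above
  // pass (4 >= 2 + 1 and 1 < 4); Y is then broadcast over the leading and
  // trailing dimensions of X.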
prev_size_ = 1;
midd_size_ = 1;
......@@ -86,7 +109,9 @@ int ElementWisePlugin::initialize() {
for (int i = 0; i < dims_y_.nbDims; ++i) {
PADDLE_ENFORCE_EQ(dims_x_.d[i + axis_], dims_y_.d[i],
"Broadcast dimension mismatch.");
platform::errors::InvalidArgument(
"Broadcast dimension mismatch. The dims of input Y "
"should be a subsequence of X."));
midd_size_ *= dims_y_.d[i];
}
......@@ -221,7 +246,10 @@ int ElementwisePluginDynamic::enqueue(
elementwise_kernel<<<block, thread, 0, stream>>>(
num, x, y, out, prev_size, midd_size, post_size, details::Mul<float>());
} else {
PADDLE_THROW("Not implemented.");
PADDLE_THROW(platform::errors::Unimplemented(
"Paddle-TRT only support elementwise operation: {add, mul} currently, "
"but got %s.",
type_));
}
return cudaGetLastError() != cudaSuccess;
......
......@@ -74,7 +74,9 @@ TEST_F(TensorRTEngineTest, add_layer) {
nvinfer1::DimsCHW{1, 1, 1});
auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size,
weight.get(), bias.get());
PADDLE_ENFORCE(fc_layer != nullptr);
PADDLE_ENFORCE_NOT_NULL(fc_layer,
platform::errors::InvalidArgument(
"TRT fully connected layer building failed."));
engine_->DeclareOutput(fc_layer, 0, "y");
LOG(INFO) << "freeze network";
......@@ -116,7 +118,9 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
nvinfer1::DimsCHW{1, 2, 1});
auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2,
weight.get(), bias.get());
PADDLE_ENFORCE(fc_layer != nullptr);
PADDLE_ENFORCE_NOT_NULL(fc_layer,
platform::errors::InvalidArgument(
"TRT fully connected layer building failed."));
engine_->DeclareOutput(fc_layer, 0, "y");
engine_->FreezeNetwork();
......@@ -160,7 +164,9 @@ TEST_F(TensorRTEngineTest, test_conv2d) {
auto *conv_layer =
TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3},
weight.get(), bias.get());
PADDLE_ENFORCE(conv_layer != nullptr);
PADDLE_ENFORCE_NOT_NULL(conv_layer,
platform::errors::InvalidArgument(
"TRT convolution layer building failed."));
conv_layer->setStride(nvinfer1::DimsHW{1, 1});
conv_layer->setPadding(nvinfer1::DimsHW{1, 1});
......@@ -199,7 +205,9 @@ TEST_F(TensorRTEngineTest, test_pool2d) {
auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *x, pool_t,
nvinfer1::DimsHW{2, 2});
PADDLE_ENFORCE(pool_layer != nullptr);
PADDLE_ENFORCE_NOT_NULL(
pool_layer,
platform::errors::InvalidArgument("TRT pooling layer building failed."));
pool_layer->setStride(nvinfer1::DimsHW{1, 1});
pool_layer->setPadding(nvinfer1::DimsHW{0, 0});
......
......@@ -83,9 +83,8 @@ bool TRTInt8Calibrator::setBatch(
engine_name_, it.first));
}
const auto& d = dataptr->second;
PADDLE_ENFORCE(
cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice),
"Fail to cudaMemcpy %s for %s", engine_name_, it.first);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice));
}
data_is_set_ = true;
......
......@@ -342,9 +342,9 @@ if(WITH_MKLDNN)
### Lexcial analysis GRU model
set(GRU_PATH "${INFERENCE_DEMO_INSTALL_DIR}/gru")
download_GRU_data("${GRU_PATH}" "GRU_eval_data.tar.gz")
download_GRU_data("${GRU_PATH}" "GRU_eval_model.tar.gz")
download_GRU_data("${GRU_PATH}" "GRU_eval_model_v2.tar.gz")
set(GRU_DATA_PATH "${GRU_PATH}/GRU_eval_data.bin")
set(GRU_MODEL_PATH "${GRU_PATH}/GRU_eval_model")
set(GRU_MODEL_PATH "${GRU_PATH}/GRU_eval_model_v2")
set(LEXICAL_TEST_APP "test_analyzer_lexical_analysis")
set(LEXICAL_TEST_APP_SRC "analyzer_lexical_analysis_gru_tester.cc")
......
......@@ -27,7 +27,7 @@ TEST(AnalysisPredictor, use_gpu) {
AnalysisConfig config;
config.EnableUseGpu(100, 0);
config.SetModel(model_dir + "/model", model_dir + "/params");
config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32, true);
std::vector<PaddleTensor> inputs;
auto predictor = CreatePaddlePredictor(config);
......
......@@ -45,7 +45,9 @@ endif()
SET(OP_HEADER_DEPS xxhash executor)
if (WITH_GPU)
SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub)
if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub)
endif()
endif()
SET(OP_PREFETCH_DEPS "")
......
......@@ -12,32 +12,30 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h"
#include <string>
#include <vector>
#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h"
#include "paddle/fluid/framework/tensor_util.h"
namespace paddle {
namespace operators {
class AmpCheckFiniteAndScaleOp : public framework::OperatorWithKernel {
class CheckFiniteAndUnscaleOp : public framework::OperatorWithKernel {
public:
AmpCheckFiniteAndScaleOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
CheckFiniteAndUnscaleOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X",
"amp_check_finite_and_unscale");
"check_finite_and_unscale");
OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out",
"amp_check_finite_and_unscale");
"check_finite_and_unscale");
PADDLE_ENFORCE_EQ(
ctx->Inputs("X").size(), ctx->Outputs("Out").size(),
platform::errors::InvalidArgument(
"The input(X) and output(Out) should have same size in "
"Operator(amp_check_finite_and_unscale), size of input(X) is %d "
"Operator(check_finite_and_unscale), size of input(X) is %d "
"and size of output(Out) is %d.",
ctx->Inputs("X").size(), ctx->Outputs("Out").size()));
auto x_dims = ctx->GetInputsDim("X");
......@@ -47,34 +45,34 @@ class AmpCheckFiniteAndScaleOp : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
}
};
class AmpCheckFiniteAndScaleOpMaker : public framework::OpProtoAndCheckerMaker {
class CheckFiniteAndUnscaleOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput(
"X",
"(Tensors) The input tensors of amp_check_finite_and_scale operator.")
"(Tensors) The input tensors of check_finite_and_unscale operator.")
.AsDuplicable();
AddInput("Scale",
"(Tensor) 1-dim tensor, the scale of amp_check_finite_and_scale "
"(Tensor) 1-dim tensor, the scale of check_finite_and_unscale "
"operator.");
AddOutput("Out",
"(Tensors) The scaled output tensor of "
"amp_check_finite_and_unscale operator.")
"check_finite_and_unscale operator.")
.AsDuplicable();
AddOutput("FoundInfinite",
"(Tensor) 1-dim tensor, contains a bool scalar, which indicates "
"if there there is infinite or nan item in input X.");
AddComment(R"DOC(
amp_check_finite_and_scale operator.
check_finite_and_unscale operator.
Check if input X contains all finite data; if yes, unscale it by input Scale.
$$Out = X * scale$$
$$Out = X / scale$$
If any tensor in X contains Inf or NaN, an indicator will be generated:
FoundInfinite will be 1 (True), and Out will not be unscaled. In this case, the data of
......@@ -85,20 +83,59 @@ Otherwise, FoundInfinite will be 0 (False).
}
};
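// A minimal host-side sketch of the semantics documented above (illustration
// only, not part of the operator; assumes float data already on the CPU):
//
//   bool CheckFiniteAndUnscaleRef(const std::vector<float>& x, float scale,
//                                 std::vector<float>* out) {
//     bool found_infinite = false;
//     for (float v : x) {
//       if (!std::isfinite(v)) {
//         found_infinite = true;  // maps to the FoundInfinite output
//         break;
//       }
//     }
//     out->resize(x.size());
//     for (size_t i = 0; i < x.size(); ++i) {
//       // Out = X / scale when all data is finite; otherwise Out should be
//       // ignored by the caller.
//       (*out)[i] = found_infinite ? x[i] : x[i] / scale;
//     }
//     return found_infinite;
//   }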
template <typename T>
class CheckFiniteAndUnscaleCpuKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
const auto xs = ctx.MultiInput<framework::Tensor>("X");
const auto* scale = ctx.Input<framework::Tensor>("Scale");
auto outs = ctx.MultiOutput<framework::Tensor>("Out");
auto* found_inf = ctx.Output<framework::Tensor>("FoundInfinite");
const T* scale_data = scale->data<T>();
bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace());
*found_inf_data = false;
framework::Tensor is_finite =
ctx.AllocateTmpTensor<bool, platform::CPUDeviceContext>({1}, dev_ctx);
bool* is_finite_data = is_finite.template data<bool>();
auto& dev = *ctx.template device_context<platform::CPUDeviceContext>()
.eigen_device();
T inverse_scale = Inverse<T>(*scale_data);
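    // Multiplying by the precomputed inverse avoids a per-element division.
    // Tensors are scanned in order: once an Inf/NaN is found, that output and
    // every later one is zeroed instead of unscaled.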
for (size_t i = 0; i < xs.size(); ++i) {
const auto* x = xs[i];
auto* out = outs[i];
out->mutable_data<T>(dev_ctx.GetPlace());
if (!(*found_inf_data)) {
framework::TensorIsfinite(*x, &is_finite);
*found_inf_data = !(*is_finite_data);
}
auto eigen_out = framework::EigenVector<T>::Flatten(*out);
auto eigen_in = framework::EigenVector<T>::Flatten(*x);
if (!(*found_inf_data)) {
eigen_out.device(dev) = eigen_in * inverse_scale;
} else {
eigen_out.device(dev) = eigen_in * static_cast<T>(0);
}
}
return;
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(
amp_check_finite_and_scale, ops::AmpCheckFiniteAndScaleOp,
ops::AmpCheckFiniteAndScaleOpMaker,
check_finite_and_unscale, ops::CheckFiniteAndUnscaleOp,
ops::CheckFiniteAndUnscaleOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
amp_check_finite_and_scale,
ops::AmpCheckFiniteAndScaleKernel<paddle::platform::CPUDeviceContext,
float>,
ops::AmpCheckFiniteAndScaleKernel<paddle::platform::CPUDeviceContext,
double>);
REGISTER_OP_CPU_KERNEL(check_finite_and_unscale,
ops::CheckFiniteAndUnscaleCpuKernel<float>,
ops::CheckFiniteAndUnscaleCpuKernel<double>);
......@@ -14,28 +14,31 @@ limitations under the License. */
#include <cuda.h>
#include "paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void AmpCheckFiniteAndScale(const T* in, const T* scale, int num,
bool* found_inf, T* out) {
__global__ void GpuInverse(const T* s, T* o) {
*o = Inverse<T>(*s);
}
template <typename T>
__global__ void CheckFiniteAndUnscale(const T* in, const T* scale, int num,
bool* found_inf, T* out) {
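  // One thread per element; any thread that sees a non-finite value stores
  // true into *found_inf (a benign race, since all writers store the same
  // value).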
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < num) {
if (!isfinite(in[idx])) {
*found_inf = 1;
*found_inf = true;
}
out[idx] = *found_inf ? in[idx] : in[idx] * scale[0];
out[idx] = *found_inf ? in[idx] : in[idx] * (*scale);
}
}
template <typename T>
class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
......@@ -48,6 +51,12 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T>
bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace());
cudaMemset(found_inf_data, false, found_inf->numel() * sizeof(bool));
framework::Tensor inverse_scale =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({1}, dev_ctx);
T* inverse_scale_v = inverse_scale.template data<T>();
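    // Scale is a single scalar, so a one-thread kernel inverts it on the
    // device and avoids a host-device round trip.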
GpuInverse<T><<<1, 1, 0, dev_ctx.stream()>>>(scale_data, inverse_scale_v);
for (size_t i = 0; i < xs.size(); ++i) {
const auto* x = xs[i];
auto* out = outs[i];
......@@ -55,11 +64,11 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T>
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = x->numel();
int block = 512;
int block = 1024;
int grid = (num + block - 1) / block;
VLOG(3) << "launch kernel";
AmpCheckFiniteAndScale<T><<<grid, block, 0, dev_ctx.stream()>>>(
x_data, scale_data, num, found_inf_data, out_data);
CheckFiniteAndUnscale<T><<<grid, block, 0, dev_ctx.stream()>>>(
x_data, inverse_scale_v, num, found_inf_data, out_data);
VLOG(3) << "finish kernel";
}
}
......@@ -68,9 +77,6 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T>
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
amp_check_finite_and_scale,
ops::AmpCheckFiniteAndScaleKernel<paddle::platform::CUDADeviceContext,
float>,
ops::AmpCheckFiniteAndScaleKernel<paddle::platform::CUDADeviceContext,
double>);
REGISTER_OP_CUDA_KERNEL(check_finite_and_unscale,
ops::CheckFiniteAndUnscaleGpuKernel<float>,
ops::CheckFiniteAndUnscaleGpuKernel<double>);
......@@ -16,51 +16,16 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/isfinite_op.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class AmpCheckFiniteAndScaleKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto& dev_ctx = ctx.template device_context<DeviceContext>();
const auto xs = ctx.MultiInput<framework::Tensor>("X");
const auto* scale = ctx.Input<framework::Tensor>("Scale");
auto outs = ctx.MultiOutput<framework::Tensor>("Out");
auto* found_inf = ctx.Output<framework::Tensor>("FoundInfinite");
const T* scale_data = scale->data<T>();
bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace());
*found_inf_data = false;
framework::Tensor is_finite =
ctx.AllocateTmpTensor<bool, DeviceContext>({1}, dev_ctx);
bool* is_finite_data = is_finite.template data<bool>();
auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
for (size_t i = 0; i < xs.size(); ++i) {
const auto* x = xs[i];
auto* out = outs[i];
out->mutable_data<T>(dev_ctx.GetPlace());
if (!(*found_inf_data)) {
framework::TensorIsfinite(*x, &is_finite);
if (*is_finite_data) {
auto eigen_out = framework::EigenVector<T>::Flatten(*out);
auto eigen_in = framework::EigenVector<T>::Flatten(*x);
eigen_out.device(dev) = (*scale_data) * eigen_in;
} else {
*found_inf_data = true;
break;
}
}
}
return;
}
};
template <typename T>
inline HOSTDEVICE T Inverse(T s) {
return 1.0 / s;
}
} // namespace operators
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/amp/update_loss_scaling_op.h"
#include <cstring>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
class UpdateLossScalingOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "update_loss_scaling");
OP_INOUT_CHECK(ctx->HasInput("FoundInfinite"), "Input", "FoundInfinite",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasInput("PrevLossScaling"), "Input", "PrevLossScaling",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasInput("InGoodSteps"), "Input", "InGoodSteps",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasInput("InBadSteps"), "Input", "InBadSteps",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasOutput("LossScaling"), "Output", "LossScaling",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasOutput("OutGoodSteps"), "Output", "OutGoodSteps",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasOutput("OutBadSteps"), "Output", "OutBadSteps",
"update_loss_scaling");
auto x_dims = ctx->GetInputsDim("X");
ctx->SetOutputsDim("Out", x_dims);
ctx->SetOutputDim("LossScaling", {1});
ctx->SetOutputDim("OutGoodSteps", {1});
ctx->SetOutputDim("OutBadSteps", {1});
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "PrevLossScaling"),
ctx.device_context());
}
};
class UpdateLossScalingOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensors) The input tensors of update_loss_scaling operator.")
.AsDuplicable();
AddInput("FoundInfinite",
"(Tensor) 1-dim tensor, contains a bool scalar, which indicates "
"whether there is any infinite gradient.");
AddInput("PrevLossScaling",
"(Tensor) 1-dim tensor, previous loss scaling.");
AddInput("InGoodSteps",
"(Tensor) 1-dim tensor, accumulates good steps in which all "
"gradients are finite.");
AddInput("InBadSteps",
"(Tensor) 1-dim tensor, accumulates bad steps in which some "
"gradients are infinite.");
AddOutput("Out",
"(Tensors) The output tensor of update_loss_scaling operator.")
.AsDuplicable();
AddOutput("LossScaling", "(Tensor) 1-dim tensor, updated loss scaling.");
AddOutput("OutGoodSteps", "(Tensor) 1-dim tensor, pdated good steps.");
AddOutput("OutBadSteps", "(Tensor) 1-dim tensor, updated bad steps.");
AddAttr<int>("incr_every_n_steps",
"A value represents increasing loss scaling every n "
"consecutive steps with finite gradients.");
AddAttr<int>("decr_every_n_nan_or_inf",
"A value represents decreasing loss scaling every n "
"accumulated steps with nan or inf gradients.");
AddAttr<float>("incr_ratio",
"The multiplier to use when increasing the loss scaling.")
.AddCustomChecker([](float incr_ratio) {
PADDLE_ENFORCE_EQ(incr_ratio > 1.0f, true,
platform::errors::InvalidArgument(
"'incr_ratio' should be greater than 1, but "
"the received is %f",
incr_ratio));
});
AddAttr<float>(
"decr_ratio",
"The less-than-one-multiplier to use when decreasing loss scaling.")
.AddCustomChecker([](float decr_ratio) {
PADDLE_ENFORCE_EQ(decr_ratio > 0.0f && decr_ratio < 1.0f, true,
platform::errors::InvalidArgument(
"'incr_ratio' should be between 0 and 1, but "
"the received is %f",
decr_ratio));
});
AddComment(R"DOC(
Update the loss scaling according to the overall gradients. If all gradients
are finite over incr_every_n_steps consecutive steps, the loss scaling will
increase by incr_ratio. Otherwise, the loss scaling will decrease by
decr_ratio after decr_every_n_nan_or_inf accumulated steps in which some
gradients are infinite.
)DOC");
}
};
template <typename T>
class UpdateLossScalingFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& ctx,
const bool* found_inf_data, const T* pre_loss_scaling_data,
const int* good_in_data, const int* bad_in_data,
const int incr_every_n_steps,
const int decr_every_n_nan_or_inf, const float incr_ratio,
const float decr_ratio, T* updated_loss_scaling_data,
int* good_out_data, int* bad_out_data) const {
Update<T>(found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data,
incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio,
decr_ratio, updated_loss_scaling_data, good_out_data,
bad_out_data);
}
};
template <typename T>
class LazyZeroInputs<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& dev_ctx,
const bool* found_inf_data,
const std::vector<const framework::Tensor*>& xs,
const std::vector<framework::Tensor*>& outs) const {
if (*found_inf_data) {
VLOG(1) << "-- UpdateLossScaling: Infinite values are found in grads. --";
for (size_t i = 0; i < xs.size(); ++i) {
auto* out = outs[i];
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = out->numel();
std::memset(out_data, 0, num * sizeof(T));
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(
update_loss_scaling, ops::UpdateLossScalingOp,
ops::UpdateLossScalingOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(update_loss_scaling,
ops::UpdateLossScalingKernel<CPU, float>,
ops::UpdateLossScalingKernel<CPU, double>);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/amp/update_loss_scaling_op.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void GpuUpdateLossScaling(
const bool* found_inf_data, const T* pre_loss_scaling_data,
const int* good_in_data, const int* bad_in_data,
const int incr_every_n_steps, const int decr_every_n_nan_or_inf,
const float incr_ratio, const float decr_ratio,
T* updated_loss_scaling_data, int* good_out_data, int* bad_out_data) {
Update<T>(found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data,
incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio,
updated_loss_scaling_data, good_out_data, bad_out_data);
}
template <typename T>
class UpdateLossScalingFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& dev_ctx,
const bool* found_inf_data, const T* pre_loss_scaling_data,
const int* good_in_data, const int* bad_in_data,
const int incr_every_n_steps,
const int decr_every_n_nan_or_inf, const float incr_ratio,
const float decr_ratio, T* updated_loss_scaling_data,
int* good_out_data, int* bad_out_data) const {
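    // All inputs and outputs here are scalars, so a single-thread launch is
    // sufficient.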
GpuUpdateLossScaling<T><<<1, 1, 0, dev_ctx.stream()>>>(
found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data,
incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio,
updated_loss_scaling_data, good_out_data, bad_out_data);
}
};
template <typename T>
class LazyZeroInputs<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& dev_ctx,
const bool* found_inf_data,
const std::vector<const framework::Tensor*>& xs,
const std::vector<framework::Tensor*>& outs) const {
const auto gpu_place =
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace());
bool has_inf{false};
    memory::Copy(platform::CPUPlace(), &has_inf, gpu_place, found_inf_data,
                 sizeof(bool), dev_ctx.stream());
    // The copy above is enqueued on the device stream, so synchronize before
    // reading has_inf on the host.
    dev_ctx.Wait();
if (has_inf) {
VLOG(1) << "-- UpdateLossScaling: Infinite values are found in grads. --";
for (size_t i = 0; i < xs.size(); ++i) {
auto* out = outs[i];
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = out->numel();
cudaMemset(out_data, 0, num * sizeof(T));
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using GPU = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(update_loss_scaling,
ops::UpdateLossScalingKernel<GPU, float>,
ops::UpdateLossScalingKernel<GPU, double>);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cmath>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/errors.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
HOSTDEVICE void Update(const bool* found_inf_data,
const T* pre_loss_scaling_data, const int* good_in_data,
const int* bad_in_data, const int incr_every_n_steps,
const int decr_every_n_nan_or_inf,
const float incr_ratio, const float decr_ratio,
T* updated_loss_scaling_data, int* good_out_data,
int* bad_out_data) {
if (*found_inf_data) {
*good_out_data = 0;
*bad_out_data = *bad_in_data + 1;
if (*bad_out_data == decr_every_n_nan_or_inf) {
T new_loss_scaling = *pre_loss_scaling_data * decr_ratio;
*updated_loss_scaling_data = new_loss_scaling < static_cast<T>(1)
? static_cast<T>(1)
: new_loss_scaling;
*bad_out_data = 0;
}
} else {
*bad_out_data = 0;
*good_out_data = *good_in_data + 1;
if (*good_out_data == incr_every_n_steps) {
T new_loss_scaling = *pre_loss_scaling_data * incr_ratio;
*updated_loss_scaling_data = std::isfinite(new_loss_scaling)
? new_loss_scaling
: *pre_loss_scaling_data;
*good_out_data = 0;
}
}
}
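// Worked example (illustrative numbers, not from the source): with
// incr_every_n_steps = 2, decr_every_n_nan_or_inf = 1, incr_ratio = 2.0,
// decr_ratio = 0.5 and a current loss scaling of 8:
//   step 1: finite  -> good = 1, bad = 0, scaling left at 8
//   step 2: finite  -> good reaches 2, scaling -> 16, good resets to 0
//   step 3: inf/nan -> bad reaches 1, scaling -> 8, bad resets to 0
// The decrease branch also clamps the scaling at a lower bound of 1.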
template <typename DeviceContext, typename T>
class UpdateLossScalingFunctor {
public:
void operator()(const DeviceContext& dev_ctx, const bool* found_inf_data,
const T* pre_loss_scaling_data, const int* good_in_data,
const int* bad_in_data, const int incr_every_n_steps,
const int decr_every_n_nan_or_inf, const float incr_ratio,
const float decr_ratio, T* updated_loss_scaling_data,
int* good_out_data, int* bad_out_data) const;
};
template <typename DeviceContext, typename T>
class LazyZeroInputs {
public:
void operator()(const DeviceContext& dev_ctx, const bool* found_inf_data,
const std::vector<const framework::Tensor*>& xs,
const std::vector<framework::Tensor*>& outs) const;
};
template <typename DeviceContext, typename T>
class UpdateLossScalingKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto xs = ctx.MultiInput<framework::Tensor>("X");
const auto* found_inf = ctx.Input<Tensor>("FoundInfinite");
const auto* pre_loss_scaling = ctx.Input<Tensor>("PrevLossScaling");
const auto* good_in = ctx.Input<Tensor>("InGoodSteps");
const auto* bad_in = ctx.Input<Tensor>("InBadSteps");
auto outs = ctx.MultiOutput<framework::Tensor>("Out");
auto* updated_loss_scaling = ctx.Output<Tensor>("LossScaling");
auto* good_out = ctx.Output<Tensor>("OutGoodSteps");
auto* bad_out = ctx.Output<Tensor>("OutBadSteps");
PADDLE_ENFORCE_EQ(found_inf->numel(), 1,
platform::errors::InvalidArgument(
"FoundInfinite must has only one element."));
const bool* found_inf_data = found_inf->data<bool>();
const T* pre_loss_scaling_data = pre_loss_scaling->data<T>();
const int* good_in_data = good_in->data<int>();
const int* bad_in_data = bad_in->data<int>();
auto& dev_ctx = ctx.template device_context<DeviceContext>();
T* updated_loss_scaling_data =
updated_loss_scaling->mutable_data<T>(dev_ctx.GetPlace());
int* good_out_data = good_out->mutable_data<int>(dev_ctx.GetPlace());
int* bad_out_data = bad_out->mutable_data<int>(dev_ctx.GetPlace());
const int incr_every_n_steps = ctx.Attr<int>("incr_every_n_steps");
const int decr_every_n_nan_or_inf =
ctx.Attr<int>("decr_every_n_nan_or_inf");
const float incr_ratio = ctx.Attr<float>("incr_ratio");
const float decr_ratio = ctx.Attr<float>("decr_ratio");
UpdateLossScalingFunctor<DeviceContext, T>{}(
dev_ctx, found_inf_data, pre_loss_scaling_data, good_in_data,
bad_in_data, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio,
decr_ratio, updated_loss_scaling_data, good_out_data, bad_out_data);
LazyZeroInputs<DeviceContext, T>{}(dev_ctx, found_inf_data, xs, outs);
}
};
} // namespace operators
} // namespace paddle
......@@ -111,8 +111,16 @@ class CompareOp : public framework::OperatorWithKernel {
framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
// CompareOp kernel's device type is decided by input tensor place
bool force_cpu = ctx.Attr<bool>("force_cpu");
kt.place_ = force_cpu ? platform::CPUPlace()
: ctx.Input<framework::LoDTensor>("X")->place();
if (force_cpu) {
kt.place_ = platform::CPUPlace();
} else {
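      // A CUDAPinned input lives in page-locked host memory; presumably no
      // compare kernel targets that place directly, so fall back to the
      // context place for it.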
if (ctx.Input<framework::LoDTensor>("X")->place().type() !=
typeid(platform::CUDAPinnedPlace)) {
kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
} else {
kt.place_ = ctx.GetPlace();
}
}
return kt;
}
};
......
......@@ -41,9 +41,13 @@ detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_fo
detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc)
if(WITH_GPU)
detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub)
detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS memory cub)
detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc collect_fpn_proposals_op.cu DEPS memory cub)
set(TMPDEPS memory)
if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
set(TMPDEPS memory cub)
endif()
detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS ${TMPDEPS})
detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS ${TMPDEPS})
detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc collect_fpn_proposals_op.cu DEPS ${TMPDEPS})
else()
detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc)
......
......@@ -176,14 +176,26 @@ static void DistGradFunction(const framework::ExecutionContext& context) {
} else if (p == INFINITY || p == -INFINITY) {
// p=inf or -inf, Lp-norm = |z_i|, the j-th element of dz tends to 0 if
    // j != i, or equals sign(z_i) * dout if j == i.
grad_t.device(place) =
(x_minux_y_abs == out_t.broadcast(out_bcast_dims)).template cast<T>() *
sign * out_grad_t.broadcast(out_bcast_dims);
if (platform::is_cpu_place(context.GetPlace())) {
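      // sign.eval() materializes the sign expression before it is reused,
      // presumably working around an Eigen lazy-evaluation issue on CPU (the
      // GPU path below keeps the lazy expression).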
grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims))
.template cast<T>() *
sign.eval() * out_grad_t.broadcast(out_bcast_dims);
} else {
grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims))
.template cast<T>() *
sign * out_grad_t.broadcast(out_bcast_dims);
}
} else {
// dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout
grad_t.device(place) =
(x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * sign *
out_grad_t.broadcast(out_bcast_dims);
if (platform::is_cpu_place(context.GetPlace())) {
grad_t.device(place) =
(x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) *
sign.eval() * out_grad_t.broadcast(out_bcast_dims);
} else {
grad_t.device(place) =
(x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * sign *
out_grad_t.broadcast(out_bcast_dims);
}
}
Eigen::DSizes<int, Rank * 2> x_reshape_dims;
......
......@@ -49,8 +49,6 @@ REGISTER_OP_WITHOUT_GRADIENT(elementwise_floordiv, ops::ElementwiseOp,
REGISTER_OP_CPU_KERNEL(
elementwise_floordiv,
ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, float>,
ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, double>,
ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, int>,
ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext,
int64_t>);
......@@ -19,7 +19,5 @@ namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
elementwise_floordiv,
ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, float>,
ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, double>,
ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int>,
ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int64_t>);
......@@ -14,7 +14,6 @@ limitations under the License. */
#pragma once
#include <math.h>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
......@@ -62,15 +61,8 @@ void elementwise_floor_div(const framework::ExecutionContext &ctx,
const framework::Tensor *x,
const framework::Tensor *y, framework::Tensor *z) {
int axis = ctx.Attr<int>("axis");
auto x_dims = x->dims();
auto y_dims = y->dims();
if (x_dims.size() >= y_dims.size()) {
ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>(
ctx, x, y, axis, FloorDivFunctor<T>(), z);
} else {
ElementwiseComputeEx<InverseFloorDivFunctor<T>, DeviceContext, T>(
ctx, x, y, axis, InverseFloorDivFunctor<T>(), z);
}
ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>(
ctx, x, y, axis, FloorDivFunctor<T>(), z);
}
template <typename DeviceContext, typename T>
......
......@@ -33,22 +33,7 @@ class ElementwiseMulOp : public ElementwiseOp {
auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
#ifdef PADDLE_WITH_MKLDNN
using mkldnn::memory;
auto CanMKLDNNElementwiseMulBeUsed = [&]() {
auto x_dims = ctx.Input<Tensor>("X")->dims();
auto y_dims = ctx.Input<Tensor>("Y")->dims();
int rankdiff = x_dims.size() - y_dims.size();
// TODO(jczaja): Remove this when oneDNN performance for scalar
// broadcasting
// is improved (Ernie large situation)
if (rankdiff != 0 && y_dims.size() == 1 && y_dims[0] == 1) {
return false;
}
return true;
};
if (platform::CanMKLDNNBeUsed(ctx) && CanMKLDNNElementwiseMulBeUsed()) {
if (platform::CanMKLDNNBeUsed(ctx)) {
return framework::OpKernelType(input_data_type, ctx.GetPlace(),
framework::DataLayout::kMKLDNN,
framework::LibraryType::kMKLDNN);
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/empty_op.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
class EmptyOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("ShapeTensor",
"(Tensor<int>), optional). The shape of the output."
"It has a higher priority than Attr(shape).")
.AsDispensable();
AddInput("ShapeTensorList",
"(vector<Tensor<int>>, optional). The shape of the output. "
"It has a higher priority than Attr(shape)."
"The shape of the element in vector must be [1].")
.AsDuplicable()
.AsDispensable();
AddAttr<std::vector<int64_t>>("shape",
"(vector<int64_t>) The shape of the output")
.SetDefault({});
AddAttr<int>("dtype", "The data type of output tensor, Default is float")
.SetDefault(framework::proto::VarType::FP32);
AddOutput("Out", "(Tensor) The output tensor.");
AddComment(R"DOC(empty operator
Returns a tensor filled with uninitialized data. The shape of the tensor is
defined by the variable argument shape.
The data type of the tensor is specified by `dtype`.
)DOC");
}
};
class EmptyOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* context) const override {
OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "empty");
if (context->HasInput("ShapeTensor")) {
auto dims = context->GetInputDim("ShapeTensor");
int num_ele = 1;
for (int i = 0; i < dims.size(); ++i) {
num_ele *= dims[i];
}
context->SetOutputDim("Out", framework::make_ddim({num_ele}));
} else if (context->HasInputs("ShapeTensorList")) {
std::vector<int> out_dims;
auto dims_list = context->GetInputsDim("ShapeTensorList");
for (size_t i = 0; i < dims_list.size(); ++i) {
auto& dims = dims_list[i];
      PADDLE_ENFORCE_EQ(
          dims, framework::make_ddim({1}),
          platform::errors::InvalidArgument(
              "The shape of Tensor in list must be [1], but received [%s].",
              dims));
out_dims.push_back(dims[0]);
}
context->SetOutputDim("Out", framework::make_ddim(out_dims));
} else {
auto& shape = context->Attrs().Get<std::vector<int64_t>>("shape");
context->SetOutputDim("Out", framework::make_ddim(shape));
}
}
protected:
framework::OpKernelType GetKernelTypeForVar(
const std::string& var_name, const framework::Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const override {
if (var_name == "ShapeTensor" || var_name == "ShapeTensorList") {
return expected_kernel_type;
} else {
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(), tensor.layout());
}
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& context) const override {
return framework::OpKernelType(
framework::proto::VarType::Type(context.Attr<int>("dtype")),
context.GetPlace());
}
};
class EmptyOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(framework::InferVarTypeContext* context) const override {
auto data_type = static_cast<framework::proto::VarType::Type>(
BOOST_GET_CONST(int, context->GetAttr("dtype")));
context->SetOutputDataType("Out", data_type);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OPERATOR(
empty, ops::EmptyOp, ops::EmptyOpMaker, ops::EmptyOpVarTypeInference,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(empty, ops::EmptyKernel<plat::CPUDeviceContext, bool>,
ops::EmptyKernel<plat::CPUDeviceContext, int>,
ops::EmptyKernel<plat::CPUDeviceContext, int64_t>,
ops::EmptyKernel<plat::CPUDeviceContext, float>,
ops::EmptyKernel<plat::CPUDeviceContext, double>,
ops::EmptyKernel<plat::CPUDeviceContext, plat::float16>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/empty_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
empty, ops::EmptyKernel<plat::CUDADeviceContext, bool>,
ops::EmptyKernel<plat::CUDADeviceContext, int>,
ops::EmptyKernel<plat::CUDADeviceContext, int64_t>,
ops::EmptyKernel<plat::CUDADeviceContext, float>,
ops::EmptyKernel<plat::CUDADeviceContext, double>,
ops::EmptyKernel<plat::CUDADeviceContext, plat::float16>);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/utils.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class EmptyKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto dtype = static_cast<framework::proto::VarType::Type>(
context.Attr<int>("dtype"));
Tensor *out_tensor = context.Output<Tensor>("Out");
auto shape = GetShape(context);
out_tensor->Resize(shape);
out_tensor->mutable_data(context.GetPlace(), dtype);
}
};
} // namespace operators
} // namespace paddle
......@@ -228,6 +228,26 @@ class ExpandGradOpMaker : public framework::SingleGradOpMaker<T> {
}
};
template <typename T>
class ExpandDoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetInput("X", this->OutputGrad(framework::GradVarName("X")));
op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out")));
if (this->HasInput("expand_times_tensor")) {
op->SetInput("expand_times_tensor", this->Input("expand_times_tensor"));
}
if (this->HasInput("ExpandTimes")) {
op->SetInput("ExpandTimes", this->Input("ExpandTimes"));
}
op->SetAttrMap(this->Attrs());
op->SetType("expand");
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandGradNoNeedBufVarsInferer, "X");
} // namespace operators
......@@ -238,6 +258,8 @@ REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker,
ops::ExpandGradOpMaker<paddle::framework::OpDesc>,
ops::ExpandGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp,
ops::ExpandDoubleGradOpMaker<paddle::framework::OpDesc>,
ops::ExpandDoubleGradOpMaker<paddle::imperative::OpBase>,
ops::ExpandGradNoNeedBufVarsInferer);
REGISTER_OP_CPU_KERNEL(
expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -230,6 +230,26 @@ class ExpandV2GradOpMaker : public framework::SingleGradOpMaker<T> {
}
};
template <typename T>
class ExpandV2DoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("expand_v2");
op->SetInput("X", this->OutputGrad(framework::GradVarName("X")));
op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out")));
if (this->HasInput("expand_shapes_tensor")) {
op->SetInput("expand_shapes_tensor", this->Input("expand_shapes_tensor"));
}
if (this->HasInput("Shape")) {
op->SetInput("Shape", this->Input("Shape"));
}
op->SetAttrMap(this->Attrs());
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandV2GradNoNeedBufVarsInferer, "X");
} // namespace operators
......@@ -240,6 +260,8 @@ REGISTER_OPERATOR(expand_v2, ops::ExpandV2Op, ops::ExpandV2OpMaker,
ops::ExpandV2GradOpMaker<paddle::framework::OpDesc>,
ops::ExpandV2GradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(expand_v2_grad, ops::ExpandV2GradOp,
ops::ExpandV2DoubleGradOpMaker<paddle::framework::OpDesc>,
ops::ExpandV2DoubleGradOpMaker<paddle::imperative::OpBase>,
ops::ExpandV2GradNoNeedBufVarsInferer);
REGISTER_OP_CPU_KERNEL(
expand_v2, ops::ExpandV2Kernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -27,27 +27,6 @@ namespace operators {
using Tensor = framework::Tensor;
inline framework::DDim GetShape(const framework::ExecutionContext &ctx,
std::string op_type) {
// 1. shape is a Tensor
if (ctx.HasInput("ShapeTensor")) {
auto *shape_tensor = ctx.Input<framework::LoDTensor>("ShapeTensor");
auto vec_shape = GetDataFromTensor<int>(shape_tensor);
return framework::make_ddim(vec_shape);
}
// 2. shape is a list/tuple containing Tensor
auto shape_tensor_list = ctx.MultiInput<framework::Tensor>("ShapeTensorList");
if (shape_tensor_list.size() > 0) {
auto vec_shape = GetDataFromTensorList(shape_tensor_list);
return framework::make_ddim(vec_shape);
}
// 3. shape is a list/tuple without containing Tensor
auto vec_shape = ctx.Attr<std::vector<int64_t>>("shape");
return framework::make_ddim(vec_shape);
}
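// A minimal sketch of the consolidated helper these kernels now call
// (assumed to live in paddle/fluid/operators/utils.h); it mirrors the
// removed per-op copy above, minus the unused op_type parameter:
//
// inline framework::DDim GetShape(const framework::ExecutionContext &ctx) {
//   // 1. shape is a Tensor
//   if (ctx.HasInput("ShapeTensor")) {
//     auto *shape_tensor = ctx.Input<framework::LoDTensor>("ShapeTensor");
//     return framework::make_ddim(GetDataFromTensor<int>(shape_tensor));
//   }
//   // 2. shape is a list/tuple containing Tensor
//   auto tensor_list = ctx.MultiInput<framework::Tensor>("ShapeTensorList");
//   if (tensor_list.size() > 0) {
//     return framework::make_ddim(GetDataFromTensorList(tensor_list));
//   }
//   // 3. shape is a plain attribute
//   return framework::make_ddim(ctx.Attr<std::vector<int64_t>>("shape"));
// }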
template <typename T>
class FillConstantKernel : public framework::OpKernel<T> {
public:
......@@ -93,8 +72,7 @@ class FillConstantKernel : public framework::OpKernel<T> {
}
value = tensor_data[0];
}
const std::string op_type = "fill_constant";
auto shape = GetShape(ctx, op_type);
auto shape = GetShape(ctx);
if (out_var->IsType<framework::LoDTensor>()) {
tensor = out_var->GetMutable<framework::LoDTensor>();
......
......@@ -367,8 +367,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
auto blas = math::GetBlas<DeviceContext, T>(ctx);
for (int64_t i = 0; i < ids_numel; ++i) {
PADDLE_ENFORCE_LT(ids_data[i], row_number);
PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i);
PADDLE_ENFORCE_LT(
ids_data[i], row_number,
platform::errors::OutOfRange(
"Value of Ids %d should less than dict size %d.", i, row_number));
PADDLE_ENFORCE_GE(ids_data[i], 0,
platform::errors::OutOfRange(
"Value of Ids %d should greater than ZERO.", i));
memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width,
row_width * sizeof(T));
}
......@@ -473,8 +478,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
for (int64_t i = 0; i < ids_numel; ++i) {
PADDLE_ENFORCE_LT(ids_data[i], row_number);
PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i);
PADDLE_ENFORCE_LT(
ids_data[i], row_number,
platform::errors::OutOfRange(
"Value of Ids %d should less than dict size %d.", i, row_number));
PADDLE_ENFORCE_GE(ids_data[i], 0,
platform::errors::OutOfRange(
"Value of Ids %d should greater than ZERO.", i));
memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width,
row_width * sizeof(T));
}
......
......@@ -30,16 +30,18 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fusion_gru");
OP_INOUT_CHECK(ctx->HasInput("WeightX"), "Input", "WeightX", "fusion_gru");
OP_INOUT_CHECK(ctx->HasInput("WeightH"), "Input", "WeightH", "fusion_gru");
OP_INOUT_CHECK(ctx->HasOutput("XX"), "Output", "XX", "fusion_gru");
OP_INOUT_CHECK(ctx->HasOutput("Hidden"), "Output", "Hidden", "fusion_gru");
auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(x_dims.size(), 2,
platform::errors::InvalidArgument(
"Input(X)'s rank must be 2, but received input dim "
"size is:%d, input dim is:[%s]",
x_dims.size(), x_dims));
auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1)
? framework::flatten_to_2d(x_dims, 1)
: x_dims;
PADDLE_ENFORCE_EQ(
x_mat_dims.size(), 2,
platform::errors::InvalidArgument("The size of input X dims should be 2, "
"or 3 with second dimension equal to "
"1, but now Input X dim is:[%s] ",
x_dims));
auto wx_dims = ctx->GetInputDim("WeightX");
PADDLE_ENFORCE_EQ(wx_dims.size(), 2,
......@@ -47,12 +49,14 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
"The rank of Input(WeightX) should be 2, but received "
"WeightX dim size is:%d, WeightX dim is:[%s] ",
wx_dims.size(), wx_dims));
PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1],
platform::errors::InvalidArgument(
"The first dimension of Input(WeightX) "
"should equal to second dimension of input x, but "
"received WeightX dimension is:%d, x dimension is:%d",
wx_dims[0], x_dims[1]));
PADDLE_ENFORCE_EQ(
wx_dims[0], x_mat_dims[1],
platform::errors::InvalidArgument(
"The first dimension of flattened WeightX"
"should equal to last dimension of flattened input X, but "
"received fattened WeightX dimension is:%d, flattened X dimension "
"is:%d",
wx_dims[0], x_mat_dims[1]));
int frame_size = wx_dims[1] / 3;
auto wh_dims = ctx->GetInputDim("WeightH");
......@@ -102,24 +106,24 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
"received bias dim is:[%s], frame size is:%d",
b_dims, frame_size));
}
framework::DDim out_dims({x_dims[0], frame_size});
framework::DDim out_dims({x_mat_dims[0], frame_size});
ctx->SetOutputDim("Hidden", out_dims);
ctx->ShareLoD("X", "Hidden");
int xx_width;
if (ctx->Attrs().Get<bool>("use_seq")) {
xx_width = wx_dims[1];
} else {
xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
xx_width = x_mat_dims[1] > wx_dims[1] ? wx_dims[1] : x_mat_dims[1];
OP_INOUT_CHECK(ctx->HasOutput("ReorderedH0"), "Output", "ReorderedH0",
"fusion_gru");
OP_INOUT_CHECK(ctx->HasOutput("BatchedInput"), "Output", "BatchedInput",
"fusion_gru");
OP_INOUT_CHECK(ctx->HasOutput("BatchedOut"), "Output", "BatchedOut",
"fusion_gru");
ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
ctx->SetOutputDim("BatchedInput", {x_mat_dims[0], wx_dims[1]});
ctx->SetOutputDim("BatchedOut", out_dims);
}
ctx->SetOutputDim("XX", {x_dims[0], xx_width});
ctx->SetOutputDim("XX", {x_mat_dims[0], xx_width});
ctx->ShareLoD("X", "XX");
}
......@@ -202,6 +206,27 @@ void FusionGRUOpMaker::Make() {
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>(
"mkldnn_data_type",
"(string, default \"float32\"). Data type of mkldnn kernel")
.SetDefault("float32")
.InEnum({"float32", "int8", "bfloat16"});
AddAttr<float>("Scale_data",
"Scale to be used for int8 input/output data."
"Only used with MKL-DNN INT8.")
.SetDefault(1.0f);
AddAttr<float>("Shift_data",
"Shift to be used for int8 input/output data."
"Only used with MKL-DNN INT8.")
.SetDefault(0.0f);
AddAttr<std::vector<float>>("Scale_weights",
"Scale_weights to be used for int8 weights data."
"Only used with MKL-DNN INT8.")
.SetDefault({1.0f});
AddAttr<bool>("force_fp32_output",
"(bool, default false) Force INT8 kernel output FP32, only "
"used in MKL-DNN INT8")
.SetDefault(false);
AddComment(R"DOC(
The Fusion complete GRU Operator.
This operator fuses the fully-connected operator into GRU,
......@@ -220,14 +245,17 @@ class FusionGRUKernel : public framework::OpKernel<T> {
}
}
#define INIT_BASE_DEFINES \
auto* x = ctx.Input<LoDTensor>("X"); \
auto* wh = ctx.Input<Tensor>("WeightH"); \
auto* xx = ctx.Output<LoDTensor>("XX"); \
auto x_lod = x->lod(); \
auto x_dims = x->dims(); /* T x M*/ \
auto wh_dims = wh->dims(); /* D x 3D*/ \
const int total_T = x_dims[0]; \
#define INIT_BASE_DEFINES \
auto* x = ctx.Input<LoDTensor>("X"); \
auto* wh = ctx.Input<Tensor>("WeightH"); \
auto* xx = ctx.Output<LoDTensor>("XX"); \
auto x_lod = x->lod(); \
auto x_dims = x->dims(); /* T x M*/ \
auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1) \
? framework::flatten_to_2d(x_dims, 1) \
: x_dims; \
auto wh_dims = wh->dims(); /* D x 3D*/ \
const int total_T = x_mat_dims[0]; \
const int D3 = wh_dims[1]
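// Note: flatten_to_2d squeezes a [T, 1, M] input down to [T, M], e.g.
// framework::flatten_to_2d(framework::make_ddim({4, 1, 8}), 1) == [4, 8],
// so the fused GRU accepts both 2-D and squeezable 3-D inputs.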
#define INIT_OTHER_DEFINES \
......@@ -236,7 +264,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
auto* bias = ctx.Input<Tensor>("Bias"); \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
bool is_reverse = ctx.Attr<bool>("is_reverse"); \
const int M = x_dims[1]; \
const int M = x_mat_dims[1]; \
const int D = wh_dims[0]; \
const int D2 = D * 2; \
const jit::gru_attr_t attr( \
......
......@@ -21,11 +21,12 @@ namespace operators {
using paddle::framework::LoDTensor;
using paddle::framework::Tensor;
using paddle::platform::CPUDeviceContext;
using paddle::platform::CreateKey;
using paddle::platform::MKLDNNGetDataType;
using paddle::platform::MKLDNNMemDesc;
using platform::to_void_cast;
template <typename T>
template <typename T, typename T_out = T>
class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
public:
GRUMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
......@@ -38,7 +39,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
const std::string& unique_name)
: platform::MKLDNNHandlerT<T, dnnl::gru_forward>(
dev_ctx, dev_ctx.GetEngine(), cpu_place,
platform::CreateKey(unique_name, Ti)),
CreateKey(unique_name, MKLDNNGetDataType<T>(), Ti)),
N(N),
Ti(Ti),
IC(IC),
......@@ -47,9 +48,29 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
// do not depend on Ti size but primitive and input/output memory do
if (platform::MKLDNNDeviceContext::tls().get_cur_mkldnn_session_id() !=
platform::MKLDNNDeviceContextThreadLocals::kMKLDNNSessionID_Default) {
memory_key_ = unique_name;
memory_key_ = CreateKey(unique_name, MKLDNNGetDataType<T>());
} else {
memory_key_ = unique_name + "-t:" + platform::ThreadIDasStr();
memory_key_ = CreateKey(unique_name, MKLDNNGetDataType<T>(), "-t:",
platform::ThreadIDasStr());
}
// Is this an int8 kernel?
const bool is_INT8 = std::is_same<T, uint8_t>::value;
if (is_INT8) {
// Int8 attributes
const float scale_data = ctx.Attr<float>("Scale_data");
const float shift_data = ctx.Attr<float>("Shift_data");
const auto scale_weights = ctx.Attr<std::vector<float>>("Scale_weights");
const int weights_scale_mask =
0 +
(1 << 3) // bit, indicating the unique scales for `g` dim in `ldigo`
+
(1 << 4); // bit, indicating the unique scales for `o` dim in `ldigo`
attr_.set_rnn_data_qparams(scale_data, shift_data);
attr_.set_rnn_weights_qparams(weights_scale_mask, scale_weights);
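// With weights_scale_mask = (1 << 3) | (1 << 4), the scales vary over both
// the `g` and `o` dims of the `ldigo` weights, so Scale_weights is expected
// to hold one value per gate-output pair (G * OC values in total).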
}
if (!this->isCached()) {
......@@ -63,6 +84,10 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
platform::errors::Unimplemented(
"oneDNN fusion_gru supports only tanh as an activation."));
// Weights for the int8 kernel are of type s8
const auto weights_dt =
is_INT8 ? dnnl::memory::data_type::s8 : dnnl::memory::data_type::f32;
// oneDNN RNN dimensions
const int64_t D = 1; // Directions
const int64_t L = 1; // Layers (PP supports only 1 stacked layer)
......@@ -71,19 +96,16 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
// Create memory descriptors
auto input_md = MKLDNNMemDesc({Ti, N, IC}, MKLDNNGetDataType<T>(),
MKLDNNMemoryFormat::any);
auto weight_x_md = MKLDNNMemDesc(
{L, D, IC, G, OC}, MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::any);
auto weight_h_md = MKLDNNMemDesc(
{L, D, OC, G, OC}, MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::any);
auto weight_x_md =
MKLDNNMemDesc({L, D, IC, G, OC}, weights_dt, MKLDNNMemoryFormat::any);
auto weight_h_md =
MKLDNNMemDesc({L, D, OC, G, OC}, weights_dt, MKLDNNMemoryFormat::any);
auto bias_md = MKLDNNMemDesc({L, D, G, OC}, MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldgo);
auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType<T>(),
auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType<T_out>(),
MKLDNNMemoryFormat::any);
auto h0_md = dnnl::memory::desc();
if (h0) {
h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<T>(),
MKLDNNMemoryFormat::ldnc);
}
auto h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<T>(),
MKLDNNMemoryFormat::ldnc);
// Create GRU oneDNN primitive
const auto direction =
......@@ -91,7 +113,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
: dnnl::rnn_direction::unidirectional_left2right;
this->AcquireForwardPrimitiveDescriptor(
dnnl::prop_kind::forward_inference, direction, input_md, h0_md,
attr_, dnnl::prop_kind::forward_inference, direction, input_md, h0_md,
weight_x_md, weight_h_md, bias_md, hidden_md, dnnl::memory::desc());
}
}
......@@ -101,29 +123,31 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
dnnl::memory::format_tag::ntc);
}
void reorderRNNdata(const T* input_data, T* output_data,
void reorderRNNdata(void* input_data, void* output_data,
std::vector<size_t> lod, const bool is_reverse,
platform::RNNReorderType reorder_type) {
switch (reorder_type) {
// Reorder input memory [WORDS, C] + LoD -> [N, T, C]
case platform::RNNReorderType::PP_NTC: {
auto* input_data_iter = input_data;
auto* input_data_iter = reinterpret_cast<T*>(input_data);
auto* output_data_iter = reinterpret_cast<T*>(output_data);
for (int n = 0; n < N; ++n) {
const auto num_elements = (lod[n + 1] - lod[n]) * IC;
const auto offset = is_reverse ? (Ti * IC - num_elements) : 0;
memcpy(output_data + n * Ti * IC + offset, input_data_iter,
memcpy(output_data_iter + n * Ti * IC + offset, input_data_iter,
sizeof(T) * num_elements);
input_data_iter += num_elements;
}
} break;
// Reorder input memory [WORDS, C] + LoD -> [T, N, C]
case platform::RNNReorderType::PP_TNC: {
auto* input_data_iter = input_data;
auto* input_data_iter = reinterpret_cast<T*>(input_data);
auto* output_data_iter = reinterpret_cast<T*>(output_data);
for (int n = 0; n < N; ++n) {
const auto num_elements = (lod[n + 1] - lod[n]);
const auto offset = is_reverse ? (Ti - num_elements) : 0;
for (size_t t = 0; t < num_elements; ++t) {
memcpy(output_data + (t + offset) * N * IC + n * IC,
memcpy(output_data_iter + (t + offset) * N * IC + n * IC,
input_data_iter, sizeof(T) * IC);
input_data_iter += IC;
}
......@@ -131,24 +155,27 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
} break;
// Reorder output values to PP format [N, T, C] -> [WORDS, C]
case platform::RNNReorderType::NTC_PP: {
auto* output_data_iter = output_data;
auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
for (int n = 0; n < N; ++n) {
const auto num_elements = (lod[n + 1] - lod[n]) * OC;
const auto offset = is_reverse ? (Ti * OC - num_elements) : 0;
memcpy(output_data_iter, input_data + n * Ti * OC + offset,
sizeof(T) * num_elements);
memcpy(output_data_iter, input_data_iter + n * Ti * OC + offset,
sizeof(T_out) * num_elements);
output_data_iter += num_elements;
}
} break;
// Reorder output values to PP format [T, N, C] -> [WORDS, C]
case platform::RNNReorderType::TNC_PP: {
auto* output_data_iter = output_data;
auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
for (int n = 0; n < N; ++n) {
const auto num_elements = lod[n + 1] - lod[n];
const auto offset = is_reverse ? (Ti - num_elements) : 0;
for (size_t t = 0; t < num_elements; ++t) {
memcpy(output_data_iter,
input_data + (t + offset) * N * OC + n * OC, sizeof(T) * OC);
input_data_iter + (t + offset) * N * OC + n * OC,
sizeof(T_out) * OC);
output_data_iter += OC;
}
}
......@@ -169,9 +196,9 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
}
const auto& input_lod = input->lod()[0];
auto* x_data = input->data<T>();
auto* x_data = to_void_cast(input->data<T>());
auto* x_onednn_data = reinterpret_cast<T*>(memory_p->get_data_handle());
auto* x_onednn_data = memory_p->get_data_handle();
memset(x_onednn_data, 0, sizeof(T) * N * Ti * IC);
if (platform::GetMKLDNNFormat(this->fwd_pd_->src_desc()) ==
......@@ -198,19 +225,35 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
return memory_p;
}
// TODO(grygielski) H0 is for now persistable
std::shared_ptr<dnnl::memory> AcquireH0Memory(const Tensor* h0) {
const std::string h0_key = memory_key_ + "@h0";
auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(h0_key));
auto* h0_data = to_void_cast(h0->data<T>());
if (!memory_p) {
memory_p = std::make_shared<dnnl::memory>(
this->fwd_pd_->weights_layer_desc(), this->engine_, h0_data);
auto user_h0_memory = dnnl::memory();
if (h0) {
user_h0_memory =
dnnl::memory({{1, 1, N, OC},
MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldnc},
this->engine_, to_void_cast(h0->data<float>()));
} else {
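// No H0 was provided: zero-fill src_iter so the GRU starts from a zero
// initial hidden state.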
user_h0_memory = dnnl::memory({{1, 1, N, OC},
MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldnc},
this->engine_);
memset(user_h0_memory.get_data_handle(), 0, sizeof(float) * N * OC);
}
memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_iter_desc(),
this->engine_);
dnnl::stream astream(this->engine_);
dnnl::reorder(user_h0_memory, *memory_p, attr_)
.execute(astream, user_h0_memory, *memory_p);
this->dev_ctx_.SetBlob(h0_key, memory_p);
} else {
memory_p->set_data_handle(h0_data);
}
return memory_p;
}
......@@ -245,7 +288,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
this->fwd_pd_->weights_layer_desc(), this->engine_);
dnnl::stream astream(this->engine_);
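// attr_ carries the rnn weight qparams set in the constructor, so for the
// int8 kernel this reorder also quantizes the fp32 user weights to s8.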
dnnl::reorder(user_memory, *memory_p)
dnnl::reorder(user_memory, *memory_p, attr_)
.execute(astream, user_memory, *memory_p);
this->dev_ctx_.SetBlob(wx_key, memory_p);
......@@ -298,7 +341,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
this->fwd_pd_->weights_iter_desc(), this->engine_);
dnnl::stream astream(this->engine_);
dnnl::reorder(user_memory, *memory_p)
dnnl::reorder(user_memory, *memory_p, attr_)
.execute(astream, user_memory, *memory_p);
this->dev_ctx_.SetBlob(wh_key, memory_p);
......@@ -347,12 +390,26 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
// Memory size of weights, bias and h0 does not depend
// on Ti size, thus we need another key to cache them
std::string memory_key_;
dnnl::primitive_attr attr_;
};
template <typename T>
class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const bool is_INT8 = std::is_same<T, uint8_t>::value;
const bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
// TODO(grygielski) Add option for bfloat
if (!is_INT8 || force_fp32_output) {
RunKernel<float>(ctx);
} else {
RunKernel<uint8_t>(ctx);
}
}
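// T is the runtime input type (float, or uint8_t on the int8 path); Tout
// selects the output element type, so the int8 kernel can still emit fp32
// activations when force_fp32_output is set.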
template <typename Tout = T>
void RunKernel(const framework::ExecutionContext& ctx) const {
auto& dev_ctx =
ctx.template device_context<platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
......@@ -364,13 +421,16 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
const auto* weight_h = ctx.Input<Tensor>("WeightH");
const auto* bias = ctx.Input<Tensor>("Bias");
auto* hidden = ctx.Output<LoDTensor>("Hidden");
auto x_dims = input->dims();
auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1)
? framework::flatten_to_2d(x_dims, 1)
: x_dims;
// Get attributes
const bool is_reverse = ctx.Attr<bool>("is_reverse");
const bool origin_mode = ctx.Attr<bool>("origin_mode");
// Get tensor dimensions
const auto x_dims = framework::vectorize(input->dims());
const auto x_mat_dims_vec = framework::vectorize(x_mat_dims);
const auto weight_h_dims = framework::vectorize(weight_h->dims());
const auto& input_lod = input->lod()[0];
......@@ -384,15 +444,17 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
}
return res;
}();
const int64_t IC = x_dims[1]; // Input channels
const int64_t OC = weight_h_dims[0]; // Output channels
const int64_t IC = x_mat_dims_vec[1]; // Input channels
const int64_t OC = weight_h_dims[0]; // Output channels
GRUMKLDNNHandler<T> handler(ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(),
input, weight_h, h0, is_reverse, N, Ti, IC, OC,
ctx.InputName("X") + ctx.InputName("WeightH"));
GRUMKLDNNHandler<T, Tout> handler(
ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, weight_h, h0,
is_reverse, N, Ti, IC, OC,
ctx.InputName("X") + ctx.InputName("WeightH"));
auto input_memory_p =
handler.AcquireInputMemoryWithReorder(input, is_reverse);
auto h0_memory_p = handler.AcquireH0Memory(h0);
auto weight_x_memory_p =
handler.AcquireWeightXMemory(weight_x, origin_mode);
auto weight_h_memory_p =
......@@ -402,25 +464,21 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
std::unordered_map<int, dnnl::memory> gru_args = {
{DNNL_ARG_SRC_LAYER, *input_memory_p},
{DNNL_ARG_SRC_ITER, *h0_memory_p},
{DNNL_ARG_WEIGHTS_LAYER, *weight_x_memory_p},
{DNNL_ARG_WEIGHTS_ITER, *weight_h_memory_p},
{DNNL_ARG_BIAS, *bias_memory_p},
{DNNL_ARG_DST_LAYER, *hidden_onednn_memory_p}};
if (h0) {
auto h0_memory_p = handler.AcquireH0Memory(h0);
gru_args.insert({DNNL_ARG_SRC_ITER, *h0_memory_p});
}
auto gru_forward_p = handler.AcquireForwardPrimitive();
dnnl::stream astream(mkldnn_engine);
gru_forward_p->execute(astream, gru_args);
astream.wait();
auto* hidden_onednn_data =
reinterpret_cast<T*>(hidden_onednn_memory_p->get_data_handle());
auto* hidden_data = hidden->mutable_data<T>(ctx.GetPlace());
auto* hidden_onednn_data = hidden_onednn_memory_p->get_data_handle();
auto* hidden_data =
to_void_cast(hidden->mutable_data<Tout>(ctx.GetPlace()));
if (handler.is_NTC()) {
handler.reorderRNNdata(hidden_onednn_data, hidden_data, input_lod,
is_reverse, platform::RNNReorderType::NTC_PP);
......@@ -436,4 +494,5 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(fusion_gru, MKLDNN, paddle::platform::CPUPlace,
ops::FusionGRUMKLDNNKernel<float>);
ops::FusionGRUMKLDNNKernel<float>,
ops::FusionGRUMKLDNNKernel<uint8_t>);
......@@ -34,8 +34,7 @@ class CPUGaussianRandomKernel : public framework::OpKernel<T> {
auto* tensor = context.Output<framework::Tensor>("Out");
std::normal_distribution<T> dist(mean, std);
const std::string op_type = "gaussian_random";
auto shape = GetShape(context, op_type);
auto shape = GetShape(context);
tensor->Resize(shape);
int64_t size = tensor->numel();
T* data = tensor->mutable_data<T>(context.GetPlace());
......
......@@ -58,8 +58,7 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
T mean = static_cast<T>(context.Attr<float>("mean"));
T std = static_cast<T>(context.Attr<float>("std"));
thrust::counting_iterator<unsigned int> index_sequence_begin(0);
const std::string op_type = "gaussian_random";
auto shape = GetShape(context, op_type);
auto shape = GetShape(context);
tensor->Resize(shape);
T* data = tensor->mutable_data<T>(context.GetPlace());
......
......@@ -39,7 +39,7 @@ class LiteEngineOp : public framework::OperatorBase {
private:
std::vector<std::string> in_names_;
std::vector<std::string> out_names_;
paddle::lite::Predictor *engine_;
paddle::lite_api::PaddlePredictor *engine_;
framework::proto::VarType::Type precision_;
bool use_gpu_;
bool zero_copy_;
......@@ -78,10 +78,10 @@ class LiteEngineOp : public framework::OperatorBase {
framework::LoDTensor src_t =
inference::analysis::GetFromScope<framework::LoDTensor>(scope,
in_names_[i]);
paddle::lite::Tensor *dst_t = engine_->GetInput(i);
paddle::lite_api::Tensor dst_t = *(engine_->GetInput(i));
VLOG(3) << "== fluid -> lite (" << in_names_[i] << " -> "
<< engine_->GetInputNames()[i] << ")";
inference::lite::utils::TensorCopy(dst_t, &src_t, *ctx, zero_copy_);
inference::lite::utils::TensorCopy(&dst_t, &src_t, *ctx, zero_copy_);
}
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(dev_place)) {
......@@ -93,7 +93,7 @@ class LiteEngineOp : public framework::OperatorBase {
engine_->Run();
VLOG(3) << "lite engine run done";
for (size_t i = 0; i < out_names_.size(); i++) {
paddle::lite::Tensor src_t = *(engine_->GetOutput(i));
paddle::lite_api::Tensor src_t = *(engine_->GetOutput(i));
framework::LoDTensor *dst_t =
&inference::analysis::GetFromScope<framework::LoDTensor>(
scope, out_names_[i]);
......
......@@ -84,10 +84,10 @@ TEST(LiteEngineOp, engine_op) {
inference::lite::EngineConfig config;
config.valid_places = {
#ifdef PADDLE_WITH_CUDA
paddle::lite::Place({TARGET(kCUDA), PRECISION(kFloat)}),
paddle::lite_api::Place({TARGET(kCUDA), PRECISION(kFloat)}),
#endif
paddle::lite::Place({TARGET(kHost), PRECISION(kAny)}),
paddle::lite::Place({TARGET(kX86), PRECISION(kFloat)}),
paddle::lite_api::Place({TARGET(kX86), PRECISION(kFloat)}),
paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}),
};
serialize_params(&(config.param), &scope, repetitive_params);
config.model = program.Proto()->SerializeAsString();
......
......@@ -9,7 +9,11 @@ function(math_library TARGET)
set(hip_srcs)
set(math_common_deps device_context framework_proto enforce)
if (WITH_GPU)
list(APPEND math_common_deps cub)
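# CUB ships with the CUDA toolkit from 11.0 onwards, so the external
# download is only needed for older toolkits.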
if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
list(APPEND math_common_deps cub)
endif()
endif()
set(multiValueArgs DEPS)
cmake_parse_arguments(math_library "${options}" "${oneValueArgs}"
......
......@@ -128,9 +128,23 @@ struct RowwiseAdd<platform::CPUDeviceContext, T> {
const framework::Tensor& input,
const framework::Tensor& vector, framework::Tensor* output) {
auto in_dims = input.dims();
auto out_dims = output->dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector.numel(), size);
PADDLE_ENFORCE_EQ(output->dims(), in_dims);
PADDLE_ENFORCE_EQ(
vector.numel(), size,
platform::errors::InvalidArgument(
"The input vector size"
" should be equal to the size of each row of input tensor."
" Expected vector size=%d, but received %d",
size, vector.numel()));
// Keep the formatted strings alive: calling c_str() on the temporary
// returned by to_str() would leave dangling pointers.
const std::string in_dims_str = in_dims.to_str();
const std::string out_dims_str = out_dims.to_str();
PADDLE_ENFORCE_EQ(out_dims, in_dims,
platform::errors::InvalidArgument(
"The output tensor shape should be the same as the input"
" tensor shape. Expected output tensor shape: %s,"
" but received %s",
in_dims_str.c_str(), out_dims_str.c_str()));
auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(vector);
......
......@@ -88,9 +88,24 @@ struct RowwiseAdd<platform::CUDADeviceContext, T> {
const framework::Tensor& input,
const framework::Tensor& vector, framework::Tensor* output) {
auto in_dims = input.dims();
auto out_dims = output->dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector.numel(), size);
PADDLE_ENFORCE_EQ(output->dims(), in_dims);
PADDLE_ENFORCE_EQ(
vector.numel(), size,
platform::errors::InvalidArgument(
"The input vector size"
" should be equal to the size of each row of input tensor."
" Expected vector size=%d, but received %d",
size, vector.numel()));
// Keep the formatted strings alive: calling c_str() on the temporary
// returned by to_str() would leave dangling pointers.
const std::string in_dims_str = in_dims.to_str();
const std::string out_dims_str = out_dims.to_str();
PADDLE_ENFORCE_EQ(
out_dims, in_dims,
platform::errors::InvalidArgument(
"The output tensor shape should be the same as the input tensor"
" shape. Expected output tensor shape: %s,"
" but received %s",
in_dims_str.c_str(), out_dims_str.c_str()));
int blocks = 512;
int grids = (input.numel() + blocks - 1) / blocks;
RowwiseAddKernel<T><<<grids, blocks, 0, context.stream()>>>(
......@@ -113,7 +128,12 @@ void ColwiseSum<platform::CUDADeviceContext, double>::operator()(
framework::Tensor* vector) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector->numel(), size);
PADDLE_ENFORCE_EQ(vector->numel(), size,
platform::errors::InvalidArgument(
"The size of input vector"
" should be equal to the size of input tensor column"
" dimension. Expected vector size=%d, but received %d",
size, vector->numel()));
framework::Tensor one;
one.mutable_data<double>({in_dims[0]}, context.GetPlace());
SetConstant<platform::CUDADeviceContext, double> set;
......@@ -134,7 +154,12 @@ void RowwiseSum<platform::CUDADeviceContext, double>::operator()(
framework::Tensor* vector) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0]);
PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0],
platform::errors::InvalidArgument(
"The size of input vector"
" should be equal to the size of input tensor row"
" dimension. Expected vector size=%d, but received %d",
in_dims[0], vector->numel()));
framework::Tensor one;
one.mutable_data<double>({size}, context.GetPlace());
SetConstant<platform::CUDADeviceContext, double> set;
......
......@@ -59,7 +59,12 @@ void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
framework::Tensor* out) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(out->numel(), size);
PADDLE_ENFORCE_EQ(out->numel(), size,
platform::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor column"
" dimension. Expected output size=%d, but received %d",
size, out->numel()));
auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(*out);
......@@ -78,7 +83,13 @@ class ColwiseSum<platform::CPUDeviceContext, T> {
auto& in_dims = input.dims();
auto height = in_dims[0];
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), size);
PADDLE_ENFORCE_EQ(
out->numel(), size,
platform::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor column"
" dimension. Expected output size=%d, but received %d",
size, out->numel()));
T* out_buf = out->mutable_data<T>(out->place());
const T* in_buf = input.data<T>();
......@@ -100,8 +111,16 @@ void RowwiseMean<DeviceContext, T>::operator()(const DeviceContext& context,
const framework::Tensor& input,
framework::Tensor* out) {
auto in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]);
PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument(
"The rank of input tensor "
"should be 2, but received %d",
in_dims.size()));
PADDLE_ENFORCE_EQ(out->numel(), in_dims[0],
platform::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor row"
" dimension. Expected output size=%d, but received %d",
in_dims[0], out->numel()));
auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(*out);
......@@ -118,10 +137,19 @@ class RowwiseMean<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
auto& in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument(
"The rank of input tensor "
"should be 2, but received %d",
in_dims.size()));
auto height = in_dims[0];
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), height);
PADDLE_ENFORCE_EQ(
out->numel(), height,
platform::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor row"
" dimension. Expected output size=%d, but received %d",
height, out->numel()));
auto inv_size = 1.0 / size;
T* out_buf = out->mutable_data<T>(out->place());
const T* in_buf = input.data<T>();
......@@ -141,8 +169,16 @@ void RowwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
const framework::Tensor& input,
framework::Tensor* out) {
auto in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]);
PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument(
"The rank of input tensor "
"should be 2, but received %d",
in_dims.size()));
PADDLE_ENFORCE_EQ(out->numel(), in_dims[0],
platform::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor row"
" dimension. Expected output size=%d, but received %d",
in_dims[0], out->numel()));
auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(*out);
......@@ -159,10 +195,19 @@ class RowwiseSum<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
auto& in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument(
"The rank of input tensor "
"should be 2, but received %d",
in_dims.size()));
auto height = in_dims[0];
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), height);
PADDLE_ENFORCE_EQ(
out->numel(), height,
platform::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor row"
" dimension. Expected output size=%d, but received %d",
height, out->numel()));
T* out_buf = out->mutable_data<T>(out->place());
const T* in_buf = input.data<T>();
......
......@@ -224,7 +224,11 @@ TEST(math_funciton, set_constant) {
auto* ctx = new paddle::platform::CPUDeviceContext();
paddle::operators::math::set_constant(*ctx, &t, 10);
for (int64_t i = 0; i < t.numel(); ++i) {
PADDLE_ENFORCE_EQ(10, t.data<int>()[i]);
PADDLE_ENFORCE_EQ(10, t.data<int>()[i],
paddle::platform::errors::InvalidArgument(
"Each value of input"
"tensor should be 10, but received %d.",
t.data<int>()[i]));
}
delete ctx;
}
......
......@@ -18,7 +18,12 @@
void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size,
const std::vector<float>& data) {
PADDLE_ENFORCE_EQ(size, data.size());
PADDLE_ENFORCE_EQ(
size, data.size(),
paddle::platform::errors::InvalidArgument(
"The size of argument data should"
" be equal to the argument size. Expected %d, but received %d.",
size, data.size()));
for (size_t i = 0; i < data.size(); ++i) {
in_ptr[i] = paddle::platform::float16(data[i]);
}
......
......@@ -85,8 +85,9 @@ void PaddingFunctor(int rank, const framework::ExecutionContext& context,
PadFunction<DeviceContext, T, 6>(context, pads, src, pad_value, out);
break;
default:
PADDLE_THROW(
"PadOp only support tensors with no more than 6 dimensions.");
PADDLE_THROW(platform::errors::Unimplemented(
"PadOp only support tensors with no more"
" than 6 dimensions currently."));
}
}
......@@ -114,8 +115,9 @@ void PaddingGradFunctor(int rank, const framework::ExecutionContext& context,
PadGradFunction<DeviceContext, T, 6>(context, pads, src, out);
break;
default:
PADDLE_THROW(
"PadOp only support tensors with no more than 6 dimensions.");
PADDLE_THROW(platform::errors::Unimplemented(
"PadOp only support tensors with no more"
" than 6 dimensions currently."));
}
}
......
......@@ -19,6 +19,8 @@ limitations under the License. */
#include <random>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace math {
......@@ -31,7 +33,10 @@ namespace math {
class Sampler {
public:
explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) {
// PADDLE_ENFORCE_GT(range, 0, "Range should be greater than 0.");
PADDLE_ENFORCE_GT(range, 0, platform::errors::InvalidArgument(
"Range should be"
" greater than 0, but recevied %d.",
range));
if (seed == 0) {
std::random_device r;
seed_ = r();
......
......@@ -29,7 +29,12 @@ struct SelectedRowsAdd<platform::CPUDeviceContext, T> {
const framework::SelectedRows& input2,
framework::SelectedRows* output) {
auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2.height());
PADDLE_ENFORCE_EQ(
in1_height, input2.height(),
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, input2.height()));
output->set_height(in1_height);
auto& in1_rows = input1.rows();
......@@ -47,15 +52,31 @@ struct SelectedRowsAdd<platform::CPUDeviceContext, T> {
auto& in2_value = input2.value();
auto in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size());
PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size());
PADDLE_ENFORCE_EQ(
in1_row_numel, in2_value.numel() / in2_rows.size(),
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, in2_value.numel() / in2_rows.size()));
PADDLE_ENFORCE_EQ(
in1_row_numel, out_value->numel() / out_rows.size(),
platform::errors::InvalidArgument(
"The input and oupput width must be equal."
"But recieved input width = [%d], output width = [%d]",
in1_row_numel, out_value->numel() / out_rows.size()));
auto in1_place = input1.place();
PADDLE_ENFORCE(platform::is_cpu_place(in1_place));
PADDLE_ENFORCE_EQ(platform::is_cpu_place(in1_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the CPU place."));
auto in2_place = input2.place();
PADDLE_ENFORCE(platform::is_cpu_place(in2_place));
PADDLE_ENFORCE_EQ(platform::is_cpu_place(in2_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the CPU place."));
auto out_place = context.GetPlace();
PADDLE_ENFORCE(platform::is_cpu_place(out_place));
PADDLE_ENFORCE_EQ(platform::is_cpu_place(out_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the CPU place."));
auto* out_data = out_value->data<T>();
auto* in1_data = in1_value.data<T>();
......@@ -82,15 +103,35 @@ struct SelectedRowsAddTensor<platform::CPUDeviceContext, T> {
auto in1_height = input1.height();
auto in2_dims = input2.dims();
auto out_dims = output->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);
PADDLE_ENFORCE_EQ(
in1_height, in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, in2_dims[0]));
PADDLE_ENFORCE_EQ(
in1_height, out_dims[0],
platform::errors::InvalidArgument(
"The input and output height must be equal."
"But recieved input height = [%d], output height = [%d]",
in1_height, out_dims[0]));
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height);
PADDLE_ENFORCE_EQ(
in1_row_numel, input2.numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, input2.numel() / in1_height));
PADDLE_ENFORCE_EQ(
in1_row_numel, output->numel() / in1_height,
platform::errors::InvalidArgument(
"The input and output width must be equal."
"But recieved input width = [%d], output width = [%d]",
in1_row_numel, output->numel() / in1_height));
SetConstant<platform::CPUDeviceContext, T> functor;
functor(context, output, 0.0);
......@@ -121,7 +162,12 @@ struct SelectedRowsAddTo<platform::CPUDeviceContext, T> {
const int64_t input2_offset,
framework::SelectedRows* input2) {
auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2->height());
PADDLE_ENFORCE_EQ(
in1_height, input2->height(),
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, input2->height()));
auto& in1_rows = input1.rows();
auto& in2_rows = *(input2->mutable_rows());
......@@ -133,9 +179,13 @@ struct SelectedRowsAddTo<platform::CPUDeviceContext, T> {
in2_rows.Extend(in1_rows.begin(), in1_rows.end());
auto in1_place = input1.place();
PADDLE_ENFORCE(platform::is_cpu_place(in1_place));
PADDLE_ENFORCE_EQ(platform::is_cpu_place(in1_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the CPU place."));
auto in2_place = input2->place();
PADDLE_ENFORCE(platform::is_cpu_place(in2_place));
PADDLE_ENFORCE_EQ(platform::is_cpu_place(in2_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the CPU place."));
auto* in1_data = in1_value.data<T>();
auto* in2_data = in2_value->data<T>();
......@@ -163,7 +213,12 @@ struct SelectedRowsSumTo<platform::CPUDeviceContext, T> {
auto& in_rows = (*iter)->rows();
size += in_rows.end() - in_rows.begin();
auto in1_height = (*iter)->height();
PADDLE_ENFORCE_EQ(in1_height, input2->height());
PADDLE_ENFORCE_EQ(in1_height, input2->height(),
platform::errors::InvalidArgument(
"The two inputs height must be equal."
"But recieved first input height = [%d], second "
"input height = [%d]",
in1_height, input2->height()));
}
// concat rows
std::vector<int64_t> in2_rows;
......@@ -201,13 +256,23 @@ struct SelectedRowsAddToTensor<platform::CPUDeviceContext, T> {
}
auto in1_height = input1.height();
auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
PADDLE_ENFORCE_EQ(
in1_height, in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, in2_dims[0]));
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
PADDLE_ENFORCE_EQ(
in1_row_numel, input2->numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, input2->numel() / in1_height));
auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->data<T>();
......@@ -302,10 +367,12 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
continue;
}
PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1],
"all input should have same "
"dimension except for the first one");
platform::errors::InvalidArgument(
"All inputs should have same "
"dimension except for the first one."));
PADDLE_ENFORCE_EQ(input_height, input->height(),
"all input should have same height");
platform::errors::InvalidArgument(
"All inputs should have same height."));
row_num += input->rows().size();
merged_row_set.insert(input->rows().begin(), input->rows().end());
}
......@@ -421,10 +488,12 @@ struct MergeAverage<platform::CPUDeviceContext, T> {
continue;
}
PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1],
"all input should have same "
"dimension except for the first one");
platform::errors::InvalidArgument(
"All inputs should have same "
"dimension except for the first one."));
PADDLE_ENFORCE_EQ(input_height, input->height(),
"all input should have same height");
platform::errors::InvalidArgument(
"All input should have same height."));
row_num += input->rows().size();
merged_row_set.insert(input->rows().begin(), input->rows().end());
}
......@@ -492,13 +561,23 @@ struct UpdateToTensor<platform::CPUDeviceContext, T> {
framework::Tensor* input2) {
auto in1_height = input1.height();
auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
PADDLE_ENFORCE_EQ(
in1_height, in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, in2_dims[0]));
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
PADDLE_ENFORCE_EQ(
in1_row_numel, input2->numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, input2->numel() / in1_height));
auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->data<T>();
......
......@@ -30,7 +30,12 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
const framework::SelectedRows& input2,
framework::SelectedRows* output) {
auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2.height());
PADDLE_ENFORCE_EQ(
in1_height, input2.height(),
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, input2.height()));
output->set_height(in1_height);
framework::Vector<int64_t> in1_rows(input1.rows());
......@@ -48,18 +53,34 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
auto& in2_value = input2.value();
auto in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size());
PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size());
PADDLE_ENFORCE_EQ(
in1_row_numel, in2_value.numel() / in2_rows.size(),
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, in2_value.numel() / in2_rows.size()));
PADDLE_ENFORCE_EQ(
in1_row_numel, out_value->numel() / out_rows.size(),
platform::errors::InvalidArgument(
"The input and oupput width must be equal."
"But recieved input width = [%d], output width = [%d]",
in1_row_numel, out_value->numel() / out_rows.size()));
auto* out_data = out_value->data<T>();
auto* in1_data = in1_value.data<T>();
auto in1_place = input1.place();
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true);
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the GPU place."));
auto in2_place = input2.place();
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true);
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the GPU place."));
auto out_place = context.GetPlace();
PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_place), true);
PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the GPU place."));
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, out_place), out_data,
BOOST_GET_CONST(platform::CUDAPlace, in1_place), in1_data,
......@@ -104,15 +125,35 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
auto in1_height = input1.height();
auto in2_dims = input2.dims();
auto out_dims = output->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);
PADDLE_ENFORCE_EQ(
in1_height, in2_dims[0],
platform::errors::InvalidArgument(
"The two inputs height must be equal."
"But recieved first input height = [%d], first input height = [%d]",
in1_height, in2_dims[0]));
PADDLE_ENFORCE_EQ(
in1_height, out_dims[0],
platform::errors::InvalidArgument(
"The input and output height must be equal."
"But recieved input height = [%d], output height = [%d]",
in1_height, out_dims[0]));
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height);
PADDLE_ENFORCE_EQ(
in1_row_numel, input2.numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, input2.numel() / in1_height));
PADDLE_ENFORCE_EQ(
in1_row_numel, output->numel() / in1_height,
platform::errors::InvalidArgument(
"The input and output width must be equal."
"But recieved input width = [%d], output width = [%d]",
in1_row_numel, output->numel() / in1_height));
auto* in1_data = in1_value.data<T>();
auto* in2_data = input2.data<T>();
......@@ -148,7 +189,12 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
const int64_t input2_offset,
framework::SelectedRows* input2) {
auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2->height());
PADDLE_ENFORCE_EQ(
in1_height, input2->height(),
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, input2->height()));
auto& in1_rows = input1.rows();
auto& in2_rows = *(input2->mutable_rows());
......@@ -162,9 +208,13 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
}
auto in1_place = input1.place();
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true);
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the GPU place."));
auto in2_place = input2->place();
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true);
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true,
platform::errors::InvalidArgument(
"The running environment is not on the GPU place."));
auto* in1_data = in1_value.data<T>();
auto* in2_data = in2_value->data<T>();
......@@ -209,13 +259,23 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
framework::Tensor* input2) {
auto in1_height = input1.height();
auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
PADDLE_ENFORCE_EQ(
in1_height, in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, in2_dims[0]));
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
PADDLE_ENFORCE_EQ(
in1_row_numel, input2->numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, input2->numel() / in1_height));
auto* in1_data = in1_value.data<T>();
auto* in2_data = input2->data<T>();
......@@ -340,10 +400,12 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
continue;
}
PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1],
"all input should have same "
"dimension except for the first one");
platform::errors::InvalidArgument(
"All input should have same "
"dimension except for the first one."));
PADDLE_ENFORCE_EQ(input_height, input->height(),
"all input should have same height");
platform::errors::InvalidArgument(
"All input should have same height."));
merged_row_set.insert(input->rows().begin(), input->rows().end());
}
std::vector<int64_t> merge_rows_cpu(merged_row_set.begin(),
......@@ -448,13 +510,23 @@ struct UpdateToTensor<platform::CUDADeviceContext, T> {
auto in1_height = merged_in1.height();
auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
PADDLE_ENFORCE_EQ(
in1_height, in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, in2_dims[0]));
auto& in1_value = merged_in1.value();
auto& in1_rows = merged_in1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
PADDLE_ENFORCE_EQ(
in1_row_numel, input2->numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, input2->numel() / in1_height));
auto* in1_data = in1_value.template data<T>();
auto* in2_data = input2->data<T>();
......
......@@ -38,7 +38,9 @@ TEST(selected_rows_functor, gpu_add) {
{static_cast<int64_t>(rows1.size()), row_numel}),
gpu_place);
functor(ctx, in1_value, 1.0);
PADDLE_ENFORCE(cudaDeviceSynchronize());
PADDLE_ENFORCE_EQ(cudaDeviceSynchronize(), 0,
paddle::platform::errors::PreconditionNotMet(
"The all synchronization on the cuda is error!"));
std::vector<int64_t> rows2{0, 5, 7, 9};
std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
......
......@@ -34,10 +34,16 @@ class Vol2ColFunctor<platform::CPUDeviceContext, T> {
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* col,
const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol.dims().size(), 4,
"The dimension of vol should be 4.");
PADDLE_ENFORCE_EQ(col->dims().size(), 7,
"The dimension of col should be 7.");
PADDLE_ENFORCE_EQ(
vol.dims().size(), 4,
platform::errors::InvalidArgument("The dimension of"
" vol should be 4, but received %d.",
vol.dims().size()));
PADDLE_ENFORCE_EQ(
col->dims().size(), 7,
platform::errors::InvalidArgument("The dimension of"
"col should be 7, but received %d.",
col->dims().size()));
int input_channels =
(data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]);
......@@ -65,27 +71,33 @@ class Vol2ColFunctor<platform::CPUDeviceContext, T> {
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1,
output_depth,
"input_depth and output_depth are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1,
output_height,
"input_height and output_height are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1,
output_width,
"input_width and output_width are "
"mismatching.");
auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1;
PADDLE_ENFORCE_EQ(
input_depth_tmp, output_depth,
platform::errors::InvalidArgument(
"input_depth(%d) and output_depth(%d) are mismatching.",
input_depth_tmp, output_depth));
auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1;
PADDLE_ENFORCE_EQ(
input_height_tmp, output_height,
platform::errors::InvalidArgument(
"input_height(%d) and output_height(%d) are mismatching.",
input_height_tmp, output_height));
auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1;
PADDLE_ENFORCE_EQ(
input_width_tmp, output_width,
platform::errors::InvalidArgument(
"input_width(%d) and output_width(%d) are mismatching.",
input_width_tmp, output_width));
const T* vol_data = vol.data<T>();
T* col_data = col->data<T>();
......@@ -140,10 +152,16 @@ class Col2VolFunctor<platform::CPUDeviceContext, T> {
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* vol,
const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol->dims().size(), 4,
"The dimension of vol should be 4.");
PADDLE_ENFORCE_EQ(col.dims().size(), 7,
"The dimension of col should be 7.");
PADDLE_ENFORCE_EQ(
vol->dims().size(), 4,
platform::errors::InvalidArgument("The dimension of vol"
" should be 4, but received %d.",
vol->dims().size()));
PADDLE_ENFORCE_EQ(
col.dims().size(), 7,
platform::errors::InvalidArgument("The dimension of col"
" should be 7, but received %d.",
col.dims().size()));
int input_channels =
(data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]);
......@@ -170,27 +188,33 @@ class Col2VolFunctor<platform::CPUDeviceContext, T> {
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1,
output_depth,
"input_depth and output_depth are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1,
output_height,
"input_height and output_height are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1,
output_width,
"input_width and output_width are "
"mismatching.");
auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1;
PADDLE_ENFORCE_EQ(input_depth_tmp, output_depth,
platform::errors::InvalidArgument(
"input_depth(%d)"
" and output_depth(%d) are mismatching.",
input_depth_tmp, output_depth));
auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1;
PADDLE_ENFORCE_EQ(input_height_tmp, output_height,
platform::errors::InvalidArgument(
"input_height(%d)"
" and output_height(%d) are mismatching.",
input_height_tmp, output_height));
auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1;
PADDLE_ENFORCE_EQ(input_width_tmp, output_width,
platform::errors::InvalidArgument(
"input_width(%d)"
" and output_width(%d) are mismatching.",
input_width_tmp, output_width));
T* vol_data = vol->data<T>();
const T* col_data = col.data<T>();
......
......@@ -90,10 +90,16 @@ class Vol2ColFunctor<platform::CUDADeviceContext, T> {
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* col,
const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol.dims().size(), 4,
"The dimension of vol should be 4.");
PADDLE_ENFORCE_EQ(col->dims().size(), 7,
"The dimension of col should be 7.");
PADDLE_ENFORCE_EQ(
vol.dims().size(), 4,
platform::errors::InvalidArgument("The dimension of"
" vol should be 4, but received %d.",
vol.dims().size()));
PADDLE_ENFORCE_EQ(
col->dims().size(), 7,
platform::errors::InvalidArgument("The dimension of"
"col should be 7, but received %d.",
col->dims().size()));
int input_channels =
(data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]);
......@@ -117,27 +123,33 @@ class Vol2ColFunctor<platform::CUDADeviceContext, T> {
int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1];
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1,
output_depth,
"input_depth and output_depth are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1,
output_height,
"input_height and output_height are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1,
output_width,
"input_width and output_width are "
"mismatching.");
auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1;
PADDLE_ENFORCE_EQ(
input_depth_tmp, output_depth,
platform::errors::InvalidArgument(
"input_depth(%d) and output_depth(%d) are mismatching.",
input_depth_tmp, output_depth));
auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1;
PADDLE_ENFORCE_EQ(
input_height_tmp, output_height,
platform::errors::InvalidArgument(
"input_height(%d) and output_height(%d) are mismatching.",
input_height_tmp, output_height));
auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1;
PADDLE_ENFORCE_EQ(
input_width_tmp, output_width,
platform::errors::InvalidArgument(
"input_width(%d) and output_width(%d) are mismatching.",
input_width_tmp, output_width));
int num_outputs =
input_channels * output_depth * output_height * output_width;
......@@ -241,10 +253,16 @@ class Col2VolFunctor<platform::CUDADeviceContext, T> {
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* vol,
const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol->dims().size(), 4,
"The dimension of vol should be 4.");
PADDLE_ENFORCE_EQ(col.dims().size(), 7,
"The dimension of col should be 7.");
PADDLE_ENFORCE_EQ(
vol->dims().size(), 4,
platform::errors::InvalidArgument("The dimension of vol"
" should be 4, but received %d.",
vol->dims().size()));
PADDLE_ENFORCE_EQ(
col.dims().size(), 7,
platform::errors::InvalidArgument("The dimension of col"
" should be 7, but received %d.",
col.dims().size()));
int input_channels =
(data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]);
......@@ -269,27 +287,33 @@ class Col2VolFunctor<platform::CUDADeviceContext, T> {
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1,
output_depth,
"input_depth and output_depth are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1,
output_height,
"input_height and output_height are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1,
output_width,
"input_width and output_width are "
"mismatching.");
auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1;
PADDLE_ENFORCE_EQ(input_depth_tmp, output_depth,
platform::errors::InvalidArgument(
"input_depth(%d)"
" and output_depth(%d) are mismatching.",
input_depth_tmp, output_depth));
auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1;
PADDLE_ENFORCE_EQ(input_height_tmp, output_height,
platform::errors::InvalidArgument(
"input_height(%d)"
" and output_height(%d) are mismatching.",
input_height_tmp, output_height));
auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1;
PADDLE_ENFORCE_EQ(input_width_tmp, output_width,
platform::errors::InvalidArgument(
"input_width(%d)"
" and output_width(%d) are mismatching.",
input_width_tmp, output_width));
int num_kernels = input_channels * input_depth * input_height * input_width;
......
......@@ -30,8 +30,7 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> {
float std = context.Attr<float>("std");
auto* tensor = context.Output<framework::Tensor>("Out");
const std::string op_type = "gaussian_random";
auto shape = GetShape(context, op_type);
auto shape = GetShape(context);
tensor->Resize(shape);
T* data = tensor->mutable_data<T>(context.GetPlace());
int64_t size = tensor->numel();
......
......@@ -48,6 +48,9 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<float>("lars_weight_decay",
"(float, default 0.0005) LARS weight decay")
.SetDefault(0.0005);
AddAttr<float>("epsilon",
"(float, default 0.0) epsilon to avoid Division by Zero.")
.SetDefault(0.0);
AddComment(R"DOC(
Lars Momentum Optimizer.
......
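The new epsilon attribute enters the LARS trust-ratio denominator, guarding against division by zero when the gradient norm vanishes. A rough NumPy sketch of the local learning-rate computation, mirroring the kernel changes below (helper name illustrative):

import numpy as np

def lars_local_lr(lr, lars_coeff, lars_weight_decay, p, g, epsilon):
    # local_lr = lr * coeff * ||p|| / (||g|| + weight_decay * ||p|| + epsilon)
    p_norm, g_norm = np.linalg.norm(p), np.linalg.norm(g)
    if lars_weight_decay > 0 and p_norm > 0 and g_norm > 0:
        return lr * lars_coeff * p_norm / (
            g_norm + lars_weight_decay * p_norm + epsilon)
    return lr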
......@@ -23,14 +23,16 @@ __global__ void MomentumLarsKernel(const T* p, const T* g, const T* v,
const T* learning_rate, const T mu,
const int64_t num, const T lars_coeff,
const T lars_weight_decay, const T* p_norm,
const T* g_norm, T* p_out, T* v_out) {
const T* g_norm, T* p_out, T* v_out,
const T epsilon) {
T lr = learning_rate[0];
T local_lr = learning_rate[0];
CUDA_KERNEL_LOOP(i, num) {
if (p_norm[0] > 0 && g_norm[0] > 0) {
if (lars_weight_decay > 0 && p_norm[0] > 0 && g_norm[0] > 0) {
local_lr = lr * lars_coeff * p_norm[0] /
(g_norm[0] + lars_weight_decay * p_norm[0]);
(g_norm[0] + lars_weight_decay * p_norm[0] + epsilon);
}
T v_new = v[i] * mu + local_lr * (g[i] + lars_weight_decay * p[i]);
v_out[i] = v_new;
p_out[i] = p[i] - v_new;
......@@ -54,6 +56,7 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
T mu = static_cast<T>(ctx.Attr<float>("mu"));
T lars_coeff = ctx.Attr<float>("lars_coeff");
T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");
T epsilon = ctx.Attr<float>("epsilon");
auto* p = param->data<T>();
auto* v = velocity->data<T>();
......@@ -79,7 +82,7 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
eg_norm.device(*place) = eigen_g.square().sum().sqrt();
MomentumLarsKernel<<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay,
p_norm_data, g_norm_data, p_out, v_out);
p_norm_data, g_norm_data, p_out, v_out, epsilon);
}
};
......
......@@ -39,6 +39,7 @@ class LarsMomentumOpKernel : public framework::OpKernel<T> {
T mu = static_cast<T>(ctx.Attr<float>("mu"));
T lars_coeff = ctx.Attr<float>("lars_coeff");
T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");
T epsilon = ctx.Attr<float>("epsilon");
auto p_out = framework::EigenVector<T>::Flatten(*param_out);
auto v_out = framework::EigenVector<T>::Flatten(*velocity_out);
......@@ -59,9 +60,9 @@ class LarsMomentumOpKernel : public framework::OpKernel<T> {
ep_norm = p.square().sum().sqrt();
eg_norm = g.square().sum().sqrt();
T local_lr = lr[0];
if (ep_norm(0) > 0 && eg_norm(0) > 0) {
if (lars_weight_decay > 0 && ep_norm(0) > 0 && eg_norm(0) > 0) {
local_lr = lr[0] * lars_coeff * ep_norm(0) /
(eg_norm(0) + lars_weight_decay * ep_norm(0));
(eg_norm(0) + lars_weight_decay * ep_norm(0) + epsilon);
}
v_out = v * mu + local_lr * (g + lars_weight_decay * p);
p_out = p - v_out;
......
include(operators)
if(WITH_GPU)
register_operators(DEPS cub)
if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
register_operators(DEPS cub)
else()
register_operators()
endif()
else()
register_operators()
endif()
......@@ -24,5 +28,9 @@ if(WITH_GPU)
endif()
if(WITH_GPU)
nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor cub)
if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor cub)
else()
nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor)
endif()
endif()
......@@ -13,18 +13,138 @@
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/logsumexp_op.h"
#include <memory>
#include <algorithm>
#include <string>
#include <utility>
#include <vector>
namespace paddle {
namespace operators {
class LogsumexpOpMaker : public ops::ReduceOpMaker {
protected:
virtual std::string GetName() const { return "logsumexp"; }
virtual std::string GetOpType() const { return "Reduce logsumexp"; }
class LogsumexpOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "logsumexp");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "logsumexp");
auto x_dims = ctx->GetInputDim("X");
auto x_rank = x_dims.size();
PADDLE_ENFORCE_LE(x_rank, 4,
platform::errors::InvalidArgument(
"The input tensor X's dimensions of logsumexp "
"should be less equal than 4. But received X's "
"dimensions = %d, X's shape = [%s].",
x_rank, x_dims));
auto axis = ctx->Attrs().Get<std::vector<int>>("axis");
PADDLE_ENFORCE_GT(
axis.size(), 0,
platform::errors::InvalidArgument(
"The size of axis of logsumexp "
"should be greater than 0. But received the size of axis "
"of logsumexp is %d.",
axis.size()));
for (size_t i = 0; i < axis.size(); i++) {
PADDLE_ENFORCE_LT(
axis[i], x_rank,
platform::errors::InvalidArgument(
"axis[%d] should be in the "
"range [-dimension(X), dimension(X)] "
"where dimesion(X) is %d. But received axis[i] = %d.",
i, x_rank, axis[i]));
PADDLE_ENFORCE_GE(
axis[i], -x_rank,
platform::errors::InvalidArgument(
"axis[%d] should be in the "
"range [-dimension(X), dimension(X)] "
"where dimesion(X) is %d. But received axis[i] = %d.",
i, x_rank, axis[i]));
if (axis[i] < 0) {
axis[i] += x_rank;
}
}
bool keepdim = ctx->Attrs().Get<bool>("keepdim");
bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
if (reduce_all) {
if (keepdim)
ctx->SetOutputDim(
"Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
else
ctx->SetOutputDim("Out", {1});
} else {
auto dims_vector = vectorize(x_dims);
if (keepdim) {
for (size_t i = 0; i < axis.size(); ++i) {
dims_vector[axis[i]] = 1;
}
} else {
const int kDelFlag = -1;
for (size_t i = 0; i < axis.size(); ++i) {
dims_vector[axis[i]] = kDelFlag;
}
dims_vector.erase(
std::remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
dims_vector.end());
}
if (!keepdim && dims_vector.size() == 0) {
dims_vector.push_back(1);
}
auto out_dims = framework::make_ddim(dims_vector);
ctx->SetOutputDim("Out", out_dims);
if (axis.size() > 0 && axis[0] != 0) {
// Only pass LoD when not reducing on the first dim.
ctx->ShareLoD("X", /*->*/ "Out");
}
}
}
};
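The InferShape logic above can be summarized by a small pure-Python mirror (illustrative only), which makes the keepdim/reduce_all cases easy to check:

def logsumexp_out_shape(x_shape, axis, keepdim, reduce_all):
    # Pure-Python mirror of LogsumexpOp::InferShape.
    rank = len(x_shape)
    axis = [a + rank if a < 0 else a for a in axis]
    if reduce_all:
        return [1] * rank if keepdim else [1]
    if keepdim:
        return [1 if i in axis else d for i, d in enumerate(x_shape)]
    out = [d for i, d in enumerate(x_shape) if i not in axis]
    return out or [1]

assert logsumexp_out_shape([2, 3, 4], [1], False, False) == [2, 4]
assert logsumexp_out_shape([2, 3], [-1], True, False) == [2, 1]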
class LogsumexpOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor) The input tensor. Tensors with rank at most 4 are "
"supported.");
AddOutput("Out", "(Tensor) The result tensor.");
AddAttr<std::vector<int>>(
"axis",
"(list<int>, default {0}) The dimensions to reduce. "
"Must be in the range [-rank(input), rank(input)). "
"If `axis[i] < 0`, the axis[i] to reduce is `rank + axis[i]`. "
"Note that reducing on the first dim will make the LoD info lost.")
.SetDefault({0});
AddAttr<bool>("keepdim",
"(bool, default false) "
"If true, retain the reduced dimension with length 1.")
.SetDefault(false);
AddAttr<bool>("reduce_all",
"(bool, default false) "
"If true, output a scalar reduced along all dimensions.")
.SetDefault(false);
AddComment(string::Sprintf(R"DOC(
logsumexp Operator.
This operator computes the logsumexp of the input tensor along the given axis.
The result tensor has one fewer dimension per reduced axis than the input, unless keepdim is true.
If reduce_all is true, just reduce along all dimensions and output a scalar.
)DOC"));
}
};
class LogsumexpGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "logsumexp");
OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "logsumexp");
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
"Out@GRAD", "logsumexp");
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
};
template <typename T>
......@@ -32,7 +152,6 @@ class LogsumexpGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("logsumexp_grad");
op->SetInput("X", this->Input("X"));
......@@ -46,18 +165,17 @@ class LogsumexpGradOpMaker : public framework::SingleGradOpMaker<T> {
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(logsumexp, ops::ReduceOp, ops::LogsumexpOpMaker,
namespace ops = paddle::operators;
REGISTER_OPERATOR(logsumexp, ops::LogsumexpOp, ops::LogsumexpOpMaker,
ops::LogsumexpGradOpMaker<paddle::framework::OpDesc>,
ops::LogsumexpGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(logsumexp_grad, ops::ReduceGradOp);
REGISTER_OPERATOR(logsumexp_grad, ops::LogsumexpGradOp);
REGISTER_OP_CPU_KERNEL(logsumexp,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
float, ops::LogsumexpFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
double, ops::LogsumexpFunctor>);
REGISTER_OP_CPU_KERNEL(
logsumexp_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
float, ops::LogsumexpGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
ops::LogsumexpGradFunctor>);
logsumexp, ops::LogsumexpKernel<paddle::platform::CPUDeviceContext, float>,
ops::LogsumexpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
logsumexp_grad,
ops::LogsumexpGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::LogsumexpGradKernel<paddle::platform::CPUDeviceContext, double>);
......@@ -14,8 +14,8 @@
#include "paddle/fluid/operators/reduce_ops/logsumexp_op.h"
REGISTER_OP_CUDA_KERNEL(logsumexp,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::LogsumexpFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
double, ops::LogsumexpFunctor>);
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
logsumexp, ops::LogsumexpKernel<paddle::platform::CUDADeviceContext, float>,
ops::LogsumexpKernel<paddle::platform::CUDADeviceContext, double>);
......@@ -14,11 +14,20 @@
#pragma once
#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
#include <algorithm>
#include <vector>
#include "paddle/fluid/operators/reduce_ops/reduce_op_function.h"
namespace paddle {
namespace operators {
#define HANDLE_DIM(NDIM, RDIM) \
if (ndim == NDIM && rdim == RDIM) { \
ReduceFunctor<DeviceContext, OutT, NDIM, RDIM, LogsumexpFunctor>( \
context.template device_context<DeviceContext>(), *input, output, \
axis, keepdim); \
}
struct LogsumexpFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
......@@ -54,5 +63,106 @@ struct LogsumexpGradFunctor {
}
};
template <typename DeviceContext, typename OutT>
class LogsumexpKernel : public framework::OpKernel<OutT> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* input = context.Input<Tensor>("X");
auto* output = context.Output<Tensor>("Out");
output->mutable_data<OutT>(context.GetPlace());
auto axis = context.Attr<std::vector<int>>("axis");
auto keepdim = context.Attr<bool>("keepdim");
auto reduce_all = context.Attr<bool>("reduce_all");
const auto& input_dim_size = input->dims().size();
// If axis covers every dimension of the input, force reduce_all to true.
reduce_all |= (static_cast<int>(axis.size()) == input_dim_size);
if (reduce_all) {
// Flatten and reduce 1-D tensor
auto x = EigenVector<OutT>::Flatten(*input);
auto out = EigenScalar<OutT>::From(*output);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
auto reduce_dim = Eigen::array<int, 1>({{0}});
LogsumexpFunctor()(place, &x, &out, reduce_dim);
} else {
int ndim = input_dim_size;
int rdim = axis.size();
// The 5-D and 6-D cases below are commented out temporarily to speed up compilation.
// HANDLE_DIM(6, 5);
// HANDLE_DIM(6, 4);
// HANDLE_DIM(6, 3);
// HANDLE_DIM(6, 2);
// HANDLE_DIM(6, 1);
// HANDLE_DIM(5, 4);
// HANDLE_DIM(5, 3);
// HANDLE_DIM(5, 2);
// HANDLE_DIM(5, 1);
HANDLE_DIM(4, 3);
HANDLE_DIM(4, 2);
HANDLE_DIM(4, 1);
HANDLE_DIM(3, 2);
HANDLE_DIM(3, 1);
HANDLE_DIM(2, 1);
}
}
};
template <typename DeviceContext, typename T>
class LogsumexpGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* input = context.Input<Tensor>("X");
auto* output = context.Input<Tensor>("Out");
auto* output_grad = context.Input<Tensor>(framework::GradVarName("Out"));
auto* input_grad = context.Output<Tensor>(framework::GradVarName("X"));
input_grad->mutable_data<T>(context.GetPlace());
auto axis = context.Attr<std::vector<int>>("axis");
auto reduce_all = context.Attr<bool>("reduce_all");
const auto input_dim_size = context.Input<Tensor>("X")->dims().size();
reduce_all |= (static_cast<const int>(axis.size()) == input_dim_size);
if (reduce_all) {
auto x = EigenVector<T>::Flatten(*input);
auto y = EigenVector<T>::Flatten(*output);
auto dy = EigenVector<T>::Flatten(*output_grad);
auto dx = EigenVector<T>::Flatten(*input_grad);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
auto broadcast_dim =
Eigen::array<int, 1>({{static_cast<int>(input->numel())}});
LogsumexpGradFunctor()(place, &x, &y, &dx, &dy, broadcast_dim,
broadcast_dim[0]);
} else {
int rank = input->dims().size();
switch (rank) {
case 1:
ReduceGradFunctor<DeviceContext, T, 1, LogsumexpGradFunctor>(
context.template device_context<DeviceContext>(), *input, *output,
*output_grad, input_grad, axis);
break;
case 2:
ReduceGradFunctor<DeviceContext, T, 2, LogsumexpGradFunctor>(
context.template device_context<DeviceContext>(), *input, *output,
*output_grad, input_grad, axis);
break;
case 3:
ReduceGradFunctor<DeviceContext, T, 3, LogsumexpGradFunctor>(
context.template device_context<DeviceContext>(), *input, *output,
*output_grad, input_grad, axis);
break;
case 4:
ReduceGradFunctor<DeviceContext, T, 4, LogsumexpGradFunctor>(
context.template device_context<DeviceContext>(), *input, *output,
*output_grad, input_grad, axis);
break;
}
}
}
};
} // namespace operators
} // namespace paddle
......@@ -15,8 +15,9 @@
// .part used to speed up nvcc compile
#include "paddle/fluid/operators/reduce_ops/logsumexp_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
logsumexp_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::LogsumexpGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
ops::LogsumexpGradFunctor>);
logsumexp_grad,
ops::LogsumexpGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::LogsumexpGradKernel<paddle::platform::CUDADeviceContext, double>);
......@@ -27,9 +27,6 @@ class RunProgramOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(ctx->HasInputs("X"), true,
platform::errors::NotFound(
"Input(X) of RunProgramOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInputs("Params"), true,
platform::errors::NotFound(
"Input(Params) of RunProgramOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutputs("Out"), true,
platform::errors::NotFound(
"Output(Out) of RunProgramOp should not be null."));
......@@ -73,7 +70,8 @@ class RunProgramOpMaker : public framework::OpProtoAndCheckerMaker {
"(vector<LoDTensor or SelecetedRows>)"
"The input parameter of RunProgram operator, also the parameters "
"of the loaded program.")
.AsDuplicable();
.AsDuplicable()
.AsDispensable();
AddOutput("Out",
"(vector<LoDTensor>)"
"The output tensors of RunProgram operator, also the fetch "
......@@ -121,10 +119,6 @@ class RunProgramGradOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(ctx->HasInputs("X"), true,
platform::errors::NotFound(
"Input(X) of RunProgramGradOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasInputs("Params"), true,
platform::errors::NotFound(
"Input(Params) of RunProgramGradOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasInputs(framework::GradVarName("Out")), true,
platform::errors::NotFound(
......
......@@ -209,9 +209,14 @@ class RunProgramOpKernel : public framework::OpKernel<T> {
auto output_vars = ctx.MultiOutputVar("Out");
auto input_var_names = ctx.InputNames("X");
auto param_names = ctx.InputNames("Params");
auto output_var_names = ctx.OutputNames("Out");
// The current program may not hold any parameters.
std::vector<std::string> param_names;
if (!param_vars.empty()) {
param_names = ctx.InputNames("Params");
}
auto *block = ctx.Attr<BlockDesc *>("global_block");
auto *program = block->Program();
auto start_op_index = ctx.Attr<int64_t>("start_op_index");
......
......@@ -208,8 +208,11 @@ class TensorRTEngineOp : public framework::OperatorBase {
auto stream =
reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();
PADDLE_ENFORCE_EQ(input_names_.empty(), false,
"should pass at least one input");
PADDLE_ENFORCE_EQ(
input_names_.empty(), false,
platform::errors::PreconditionNotMet(
"TensorRT engine needs at least one input, but no input is found. "
"Please check if you set the input correctly."));
std::vector<std::string> output_maps =
Attr<std::vector<std::string>>("output_name_mapping");
......@@ -295,12 +298,19 @@ class TensorRTEngineOp : public framework::OperatorBase {
#endif
}
auto *fluid_v = scope.FindVar(y);
PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
PADDLE_ENFORCE_NOT_NULL(
fluid_v,
platform::errors::NotFound(
"Output variable %s is not found in TensorRT subgraph.", y));
auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
fluid_t->Resize(framework::make_ddim(ddim));
PADDLE_ENFORCE(bind_index < num_bindings,
"The bind index should be less than num_bindings");
PADDLE_ENFORCE_LT(bind_index, num_bindings,
platform::errors::InvalidArgument(
"The binding index in TRT engine should be less "
"than the number of bindings, but got binding "
"index = %d, number of bindings = %d.",
bind_index, num_bindings));
buffers[bind_index] = static_cast<void *>(fluid_t->mutable_data<float>(
BOOST_GET_CONST(platform::CUDAPlace, dev_place)));
......
......@@ -241,6 +241,26 @@ class TileGradOpMaker : public framework::SingleGradOpMaker<T> {
}
};
template <typename T>
class TileDoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("tile");
op->SetInput("X", this->OutputGrad(framework::GradVarName("X")));
op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out")));
if (this->HasInput("repeat_times_tensor")) {
op->SetInput("repeat_times_tensor", this->Input("repeat_times_tensor"));
}
if (this->HasInput("RepeatTimes")) {
op->SetInput("RepeatTimes", this->Input("RepeatTimes"));
}
op->SetAttrMap(this->Attrs());
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERER(TileGradNoNeedBufVarsInferer, "X");
} // namespace operators
......@@ -251,6 +271,8 @@ REGISTER_OPERATOR(tile, ops::TileOp, ops::TileOpMaker,
ops::TileGradOpMaker<paddle::framework::OpDesc>,
ops::TileGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(tile_grad, ops::TileGradOp,
ops::TileDoubleGradOpMaker<paddle::framework::OpDesc>,
ops::TileDoubleGradOpMaker<paddle::imperative::OpBase>,
ops::TileGradNoNeedBufVarsInferer);
REGISTER_OP_CPU_KERNEL(
tile, ops::TileKernel<paddle::platform::CPUDeviceContext, float>,
......
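The new double-grad maker reuses the tile op itself because tile's second-order gradient is again a tile: if X's gradient is perturbed by ddX, Out's gradient is perturbed by tile(ddX, repeat_times). A quick NumPy illustration:

import numpy as np

repeat_times = (2, 1)
ddx = np.arange(6.0).reshape(2, 3)   # a perturbation of X's gradient
ddout = np.tile(ddx, repeat_times)   # the induced perturbation of Out's gradient
assert ddout.shape == (4, 3)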
......@@ -81,5 +81,26 @@ inline std::vector<T> GetDataFromTensorList(
}
return vec_new_data;
}
inline framework::DDim GetShape(const framework::ExecutionContext& ctx) {
// 1. shape is a Tensor
if (ctx.HasInput("ShapeTensor")) {
auto* shape_tensor = ctx.Input<framework::LoDTensor>("ShapeTensor");
auto vec_shape = GetDataFromTensor<int>(shape_tensor);
return framework::make_ddim(vec_shape);
}
// 2. shape is a list/tuple containing Tensor
auto shape_tensor_list = ctx.MultiInput<framework::Tensor>("ShapeTensorList");
if (shape_tensor_list.size() > 0) {
auto vec_shape = GetDataFromTensorList(shape_tensor_list);
return framework::make_ddim(vec_shape);
}
// 3. shape is a list/tuple without containing Tensor
auto vec_shape = ctx.Attr<std::vector<int64_t>>("shape");
return framework::make_ddim(vec_shape);
}
} // namespace operators
} // namespace paddle
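GetShape resolves the shape from three possible sources, in priority order. A pure-Python mirror of that resolution order (ctx here is a hypothetical dict standing in for ExecutionContext):

def resolve_shape(ctx):
    # `ctx` is a hypothetical dict standing in for ExecutionContext.
    if ctx.get("ShapeTensor") is not None:   # 1) shape given as one Tensor
        return list(ctx["ShapeTensor"])
    if ctx.get("ShapeTensorList"):           # 2) shape given as a Tensor list
        return [int(d) for d in ctx["ShapeTensorList"]]
    return list(ctx["shape"])                # 3) shape given as an attribute

assert resolve_shape({"shape": [2, 3]}) == [2, 3]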
......@@ -111,7 +111,9 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = {
{"fake_quantize_dequantize_moving_average_abs_max",
{"Out", "OutScale", "OutAccum", "OutState"}},
{"fake_quantize_dequantize_abs_max", {"Out", "OutScale"}},
{"amp_check_finite_and_scale", {"Out", "FoundInfinite"}},
{"check_finite_and_unscale", {"Out", "FoundInfinite"}},
{"update_loss_scaling",
{"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}},
};
// clang-format off
......
......@@ -27,8 +27,6 @@ function(train_test TARGET_NAME)
endif()
set_tests_properties(test_train_${TARGET_NAME}${arg}
PROPERTIES DEPENDS test_${TARGET_NAME})
set_tests_properties(test_train_${TARGET_NAME}${arg}
PROPERTIES LABELS "RUN_TYPE=DIST")
if(NOT WIN32 AND NOT APPLE)
set_tests_properties(test_train_${TARGET_NAME}${arg}
PROPERTIES TIMEOUT 150)
......
......@@ -154,6 +154,7 @@ echo Step 2. Buile Paddle ...
echo ========================================
call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64
for /F %%# in ('wmic cpu get NumberOfLogicalProcessors^|findstr [0-9]') do set /a PARALLEL_PROJECT_COUNT=%%#*8/10
set build_times=1
:build_tp
echo Build third_party the %build_times% time:
......@@ -172,7 +173,7 @@ echo Build third_party successfully!
set build_times=1
:build_paddle
echo Build Paddle the %build_times% time:
msbuild /m /p:Configuration=Release /verbosity:minimal paddle.sln
msbuild /m:%PARALLEL_PROJECT_COUNT% /p:Configuration=Release /verbosity:minimal paddle.sln
if %ERRORLEVEL% NEQ 0 (
set /a build_times=%build_times%+1
if %build_times% GTR 2 (
......
......@@ -296,13 +296,13 @@ function check_style() {
commit_files=on
for file_name in `git diff --numstat upstream/$BRANCH |awk '{print $NF}'`;do
if ! pre-commit run --files $file_name ; then
git diff
commit_files=off
fi
done
if [ $commit_files == 'off' ];then
echo "code format error"
git diff 2>&1
exit 4
fi
trap : 0
......@@ -1447,7 +1447,7 @@ function example() {
cd ${PADDLE_ROOT}/tools
python sampcd_processor.py cpu;example_error=$?
if [ "$example_error" != "0" ];then
echo "Code instance execution failed"
echo "Code instance execution failed" >&2
exit 5
fi
}
......@@ -1456,15 +1456,25 @@ function summary_check_problems() {
set +x
local check_style_code=$1
local example_code=$2
local check_style_info=$3
local example_info=$4
if [ $check_style_code -ne 0 -o $example_code -ne 0 ];then
echo "========================================"
echo "summary problems:"
if [ $check_style_code -ne 0 -a $example_code -ne 0 ];then
echo "There are 2 errors: Code format error and Example code error."
else
[ $check_style_code -ne 0 ] && echo "There is 1 error: Code format error."
[ $example_code -ne 0 ] && echo "There is 1 error: Example code error."
fi
echo "========================================"
if [ $check_style_code -ne 0 ];then
echo "- Check code style failed! Please check the log and fix problems."
echo "*****Code format error***** Please fix it according to the diff information:"
echo "$check_style_info" | grep "code format error" -A $(echo "$check_style_info" | wc -l)
fi
if [ $example_code -ne 0 ];then
echo "- Check example code failed! Please check the log and fix problems."
echo "*****Example code error***** Please fix the error listed in the information:"
echo "$example_info" | grep "API check -- Example Code" -A $(echo "$example_info" | wc -l)
fi
[ $check_style_code -ne 0 ] && exit $check_style_code
[ $example_code -ne 0 ] && exit $example_code
......@@ -1486,15 +1496,16 @@ function main() {
;;
build_and_check)
set +e
$(check_style >&2)
check_style_info=$(check_style)
check_style_code=$?
generate_upstream_develop_api_spec ${PYTHON_ABI:-""} ${parallel_number}
cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number}
check_sequence_op_unittest
generate_api_spec ${PYTHON_ABI:-""} "PR"
$(example >&2)
set +e
example_info=$(example)
example_code=$?
summary_check_problems $check_style_code $example_code
summary_check_problems $check_style_code $example_code "$check_style_info" "$example_info"
assert_api_spec_approvals
;;
build)
......
......@@ -75,6 +75,7 @@ from .tensor.creation import full_like #DEFINE_ALIAS
from .tensor.creation import triu #DEFINE_ALIAS
from .tensor.creation import tril #DEFINE_ALIAS
from .tensor.creation import meshgrid #DEFINE_ALIAS
from .tensor.creation import empty #DEFINE_ALIAS
from .tensor.linalg import matmul #DEFINE_ALIAS
from .tensor.linalg import dot #DEFINE_ALIAS
# from .tensor.linalg import einsum #DEFINE_ALIAS
......
......@@ -17,6 +17,7 @@ from paddle.distributed.fleet.proto import distributed_strategy_pb2
from paddle.fluid.framework import Variable, set_flags, core
from paddle.fluid.wrapped_decorator import wrap_decorator
import google.protobuf.text_format
import google.protobuf
__all__ = ["DistributedStrategy"]
......@@ -706,11 +707,7 @@ class DistributedStrategy(object):
**Notes**:
k_steps(int) The local steps for training before parameter synchronization. Default 1.
If strategy.auto is set True, the local steps will be calculated automatically during training.
The algorithm is referenced in this paper:
`Adaptive Communication Strategies to Achieve the Best Error-Runtime Trade-off in Local-Update SGD <https://arxiv.org/pdf/1810.08313.pdf>`_.
In this case, k_steps indicates the first local steps which is suggested setting to 1.
begin_step(int) The step at which training with LocalSGD begins. Default 1.
Examples:
.. code-block:: python
......@@ -718,7 +715,8 @@ class DistributedStrategy(object):
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.localsgd = True
strategy.localsgd_configs = {"k_steps": 4}
strategy.localsgd_configs = {"k_steps": 4,
"begin_step": 30}
"""
return get_msg_dict(self.strategy.localsgd_configs)
......@@ -1133,7 +1131,91 @@ class DistributedStrategy(object):
return False
def __repr__(self):
spacing = 2
max_k = 38
max_v = 38
length = max_k + max_v + spacing
h1_format = " " + "|{{:^{}s}}|\n".format(length)
h2_format = " " + "|{{:>{}s}}{}{{:^{}s}}|\n".format(max_k, " " *
spacing, max_v)
border = " +" + "".join(["="] * length) + "+"
line = " +" + "".join(["-"] * length) + "+"
draws = border + "\n"
draws += h1_format.format("")
draws += h1_format.format("DistributedStrategy Overview")
draws += h1_format.format("")
fields = self.strategy.DESCRIPTOR.fields
str_res = ""
env_draws = line + "\n"
for f in fields:
print("{}: {}".format(f.name, f.default_value))
return str(self.strategy)
if "build_strategy" in f.name or "execution_strategy" in f.name:
continue
if "_configs" in f.name:
continue
else:
if isinstance(getattr(self.strategy, f.name), bool):
if hasattr(self.strategy, f.name + "_configs"):
if getattr(self.strategy, f.name):
draws += border + "\n"
draws += h1_format.format(
"{} = True, please check {}_configs".format(
f.name, f.name))
draws += line + "\n"
my_configs = getattr(self.strategy,
f.name + "_configs")
config_fields = my_configs.DESCRIPTOR.fields
for ff in config_fields:
if isinstance(
getattr(my_configs, ff.name),
google.protobuf.pyext._message.
RepeatedScalarContainer):
values = getattr(my_configs, ff.name)
for i, v in enumerate(values):
if i == 0:
draws += h2_format.format(ff.name,
str(v))
else:
draws += h2_format.format("",
str(v))
else:
draws += h2_format.format(
ff.name,
str(getattr(my_configs, ff.name)))
else:
env_draws += h2_format.format(
f.name, str(getattr(self.strategy, f.name)))
else:
env_draws += h2_format.format(
f.name, str(getattr(self.strategy, f.name)))
result_res = draws + border + "\n" + h1_format.format(
"Environment Flags, Communication Flags")
result_res += env_draws
build_strategy_str = border + "\n"
build_strategy_str += h1_format.format("Build Strategy")
build_strategy_str += line + "\n"
fields = self.strategy.build_strategy.DESCRIPTOR.fields
for f in fields:
build_strategy_str += h2_format.format(
f.name, str(getattr(self.strategy.build_strategy, f.name)))
build_strategy_str += border + "\n"
execution_strategy_str = h1_format.format("Execution Strategy")
execution_strategy_str += line + "\n"
fields = self.strategy.execution_strategy.DESCRIPTOR.fields
for f in fields:
execution_strategy_str += h2_format.format(
f.name, str(getattr(self.strategy.execution_strategy, f.name)))
execution_strategy_str += border + "\n"
result_res += build_strategy_str + execution_strategy_str
return result_res
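The rendering code above emits a bordered two-column table. A minimal standalone sketch of the same format strings, for reference:

max_k = max_v = 38
spacing = 2
length = max_k + max_v + spacing
h2_format = " " + "|{{:>{}s}}{}{{:^{}s}}|".format(max_k, " " * spacing, max_v)
border = " +" + "=" * length + "+"
print(border)
print(h2_format.format("localsgd", "True"))
print(border)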
......@@ -231,7 +231,7 @@ class Fleet(object):
Returns:
int: worker numbers
Examples:
.. code-block:: python
......@@ -737,7 +737,7 @@ class Fleet(object):
"""
Set the value of the learning rate manually in the optimizer.
Only works in dygraph mode.
Args:
value (float|Tensor): the value of learning rate
......@@ -877,7 +877,7 @@ class Fleet(object):
"""
Execute the optimizer once.
Only works in dygraph mode.
Returns: None
Examples:
......@@ -1019,7 +1019,7 @@ class Fleet(object):
if self.user_defined_strategy._is_strict_auto():
# turn on all the strategy for each optimizer
for opt in distributed_optimizer_list:
opt._enable_strategy(self.user_defined_strategy)
opt._enable_strategy(self.user_defined_strategy, context)
valid_optimizer_list = []
valid_graph_optimizer_list = []
......
......@@ -347,12 +347,13 @@ def pretty_print_envs(envs, header=None):
for k, v in envs.items():
max_k = max(max_k, len(k))
h_format = "{{:^{}s}}{}{{:<{}s}}\n".format(max_k, " " * spacing, max_v)
l_format = "{{:<{}s}}{{}}{{:<{}s}}\n".format(max_k, max_v)
h_format = " " + "|{{:>{}s}}{}{{:^{}s}}|\n".format(max_k, " " * spacing,
max_v)
l_format = " " + "|{{:>{}s}}{{}}{{:^{}s}}|\n".format(max_k, max_v)
length = max_k + max_v + spacing
border = "".join(["="] * length)
line = "".join(["-"] * length)
border = " +" + "".join(["="] * length) + "+"
line = " +" + "".join(["-"] * length) + "+"
draws = ""
draws += border + "\n"
......
......@@ -34,6 +34,9 @@ class AMPOptimizer(MetaOptimizerBase):
loss, role_maker, user_defined_optimizer, user_defined_strategy)
def _can_apply(self):
if not self.role_maker._is_collective:
return False
if self.user_defined_strategy.amp:
return True
return False
......@@ -42,7 +45,7 @@ class AMPOptimizer(MetaOptimizerBase):
dist_strategy.amp = False
dist_strategy.amp_configs = {}
def _enable_strategy(self, dist_strategy):
def _enable_strategy(self, dist_strategy, context):
dist_strategy.amp = True
dist_strategy.amp_configs = {
"init_loss_scaling": 32768.0,
......
......@@ -53,6 +53,9 @@ class DGCOptimizer(MetaOptimizerBase):
name=opt._name)
def _can_apply(self):
if not self.role_maker._is_collective:
return False
if self.user_defined_strategy.dgc:
if not isinstance(self.inner_opt, Momentum):
logging.warn("dgc only works on Momentum optimizer")
......@@ -69,7 +72,7 @@ class DGCOptimizer(MetaOptimizerBase):
dist_strategy.dgc = False
dist_strategy.dgc_configs = {}
def _enable_strategy(self, dist_strategy):
def _enable_strategy(self, dist_strategy, context):
dist_strategy.dgc = True
dist_strategy.dgc_configs = {"rampup_begin_step": 0, "rampup_step": 1}
......@@ -89,5 +92,5 @@ class DGCOptimizer(MetaOptimizerBase):
no_grad_set=None):
optimize_ops, params_grads = \
self.dgc_opt.minimize(loss, startup_program,
parameter_list, no_grad_set)
parameter_list, no_grad_set)
return optimize_ops, params_grads
......@@ -37,15 +37,18 @@ class GradientMergeOptimizer(MetaOptimizerBase):
self.user_defined_strategy.gradient_merge_configs["avg"])
def _can_apply(self):
if not self.role_maker._is_collective:
return False
can_apply = (self.user_defined_strategy.gradient_merge == True) and \
self.user_defined_strategy.gradient_merge_configs["k_steps"] > 1
self.user_defined_strategy.gradient_merge_configs["k_steps"] > 1
return can_apply
def _disable_strategy(self, dist_strategy):
dist_strategy.gradient_merge = False
dist_strategy.gradient_merge_configs = {}
def _enable_strategy(self, dist_strategy):
def _enable_strategy(self, dist_strategy, context):
# we currently do not support auto-enable gradient merge
return
......
......@@ -48,7 +48,7 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
callbacks=None):
pass
# should fix the variable
# should fix the variable
def _setup_nccl_op(self, startup_program, main_program, build_strategy):
trainer_endpoints = self.role_maker.get_trainer_endpoints()
trainers = trainer_endpoints
......@@ -94,31 +94,31 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
dist_strategy = self.user_defined_strategy
local_build_strategy = paddle.fluid.BuildStrategy()
local_build_strategy.enable_sequential_execution = \
dist_strategy.build_strategy.enable_sequential_execution
dist_strategy.build_strategy.enable_sequential_execution
local_build_strategy.fuse_elewise_add_act_ops = \
dist_strategy.build_strategy.fuse_elewise_add_act_ops
dist_strategy.build_strategy.fuse_elewise_add_act_ops
local_build_strategy.fuse_bn_act_ops = \
dist_strategy.build_strategy.fuse_bn_act_ops
dist_strategy.build_strategy.fuse_bn_act_ops
local_build_strategy.enable_auto_fusion = \
dist_strategy.build_strategy.enable_auto_fusion
dist_strategy.build_strategy.enable_auto_fusion
local_build_strategy.fuse_relu_depthwise_conv = \
dist_strategy.build_strategy.fuse_relu_depthwise_conv
dist_strategy.build_strategy.fuse_relu_depthwise_conv
local_build_strategy.fuse_broadcast_ops = \
dist_strategy.build_strategy.fuse_broadcast_ops
dist_strategy.build_strategy.fuse_broadcast_ops
local_build_strategy.fuse_all_optimizer_ops = \
dist_strategy.build_strategy.fuse_all_optimizer_ops
dist_strategy.build_strategy.fuse_all_optimizer_ops
local_build_strategy.enable_inplace = \
dist_strategy.build_strategy.enable_inplace
dist_strategy.build_strategy.enable_inplace
local_build_strategy.use_hierarchical_allreduce = \
dist_strategy.use_hierarchical_allreduce
dist_strategy.use_hierarchical_allreduce
local_build_strategy.hierarchical_allreduce_inter_nranks = \
dist_strategy.hierarchical_allreduce_inter_nranks
dist_strategy.hierarchical_allreduce_inter_nranks
local_build_strategy.sync_batch_norm = \
dist_strategy.sync_batch_norm
dist_strategy.sync_batch_norm
local_build_strategy.fuse_all_reduce_ops = \
dist_strategy.fuse_all_reduce_ops
dist_strategy.fuse_all_reduce_ops
local_build_strategy.nccl_comm_num = \
dist_strategy.nccl_comm_num
dist_strategy.nccl_comm_num
if self.user_defined_strategy.recompute == True:
logging.warn(
......@@ -190,7 +190,7 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
# TODO(guru4elephant): should close all PE related flags here
return
def _enable_strategy(self, dist_strategy):
def _enable_strategy(self, dist_strategy, context):
# by default, graph execution strategy is enabled
return
......
......@@ -62,6 +62,9 @@ class LambOptimizer(MetaOptimizerBase):
name=opt._name)
def _can_apply(self):
if not self.role_maker._is_collective:
return False
if self.user_defined_strategy.lamb:
if not isinstance(self.inner_opt, AdamOptimizer):
logging.warn(
......@@ -75,7 +78,7 @@ class LambOptimizer(MetaOptimizerBase):
dist_strategy.lamb = False
dist_strategy.lamb_configs = {}
def _enable_strategy(self, dist_strategy):
def _enable_strategy(self, dist_strategy, context):
dist_strategy.lamb = True
dist_strategy.lamb_configs = {
"lamb_weight_decay": 0.01,
......@@ -91,6 +94,10 @@ class LambOptimizer(MetaOptimizerBase):
return self.lamb_opt.backward(loss, startup_program, parameter_list,
no_grad_set, callbacks)
# the following function will be used by AMP if both Lamb and AMP are turned on together.
def apply_gradients(self, params_grads):
return self.lamb_opt.apply_gradients(params_grads=params_grads)
def minimize_impl(self,
loss,
startup_program=None,
......@@ -98,5 +105,5 @@ class LambOptimizer(MetaOptimizerBase):
no_grad_set=None):
optimize_ops, params_grads = \
self.lamb_opt.minimize(loss, startup_program,
parameter_list, no_grad_set)
parameter_list, no_grad_set)
return optimize_ops, params_grads
......@@ -44,13 +44,19 @@ class LarsOptimizer(MetaOptimizerBase):
parameter_list=opt._parameter_list,
regularization=opt.regularization,
grad_clip=opt._grad_clip,
name=opt._name)
name=opt._name,
exclude_from_weight_decay=configs['exclude_from_weight_decay'],
epsilon=configs['epsilon'])
def _can_apply(self):
if not self.role_maker._is_collective:
return False
if self.user_defined_strategy.lars:
if not isinstance(self.inner_opt, Momentum):
logging.warn(
"lars need the inner optimizer to be Momentum optimizer.")
"lars need the inner optimizer to be Momentum optimizer but got {}.".
format(self.inner_opt.type))
return False
return True
return False
......@@ -59,7 +65,7 @@ class LarsOptimizer(MetaOptimizerBase):
dist_strategy.lars = False
dist_strategy.lars_configs = {}
def _enable_strategy(self, dist_strategy):
def _enable_strategy(self, dist_strategy, context):
dist_strategy.lars = True
dist_strategy.lars_configs = {
"lars_coeff": 0.01,
......@@ -75,6 +81,10 @@ class LarsOptimizer(MetaOptimizerBase):
return self.lars_opt.backward(loss, startup_program, parameter_list,
no_grad_set, callbacks)
# the following function will be used by AMP if both LARS and AMP are turned on together.
def apply_gradients(self, params_grads):
return self.lars_opt.apply_gradients(params_grads=params_grads)
def minimize_impl(self,
loss,
startup_program=None,
......@@ -82,5 +92,5 @@ class LarsOptimizer(MetaOptimizerBase):
no_grad_set=None):
optimize_ops, params_grads = \
self.lars_opt.minimize(loss, startup_program,
parameter_list, no_grad_set)
parameter_list, no_grad_set)
return optimize_ops, params_grads
......@@ -29,6 +29,9 @@ class LocalSGDOptimizer(MetaOptimizerBase):
self.snapshot_key = '@SNAPSHOT'
def _can_apply(self):
if not self.role_maker._is_collective:
return False
if not self.user_defined_strategy.localsgd:
return False
......@@ -36,17 +39,17 @@ class LocalSGDOptimizer(MetaOptimizerBase):
return False
return isinstance(self.inner_opt, paddle.optimizer.momentum.Momentum) \
or isinstance(self.inner_opt, paddle.fluid.optimizer.Momentum) \
or isinstance(self.inner_opt, paddle.optimizer.sgd.SGD) \
or isinstance(self.inner_opt, paddle.fluid.optimizer.SGD)
or isinstance(self.inner_opt, paddle.fluid.optimizer.Momentum) \
or isinstance(self.inner_opt, paddle.optimizer.sgd.SGD) \
or isinstance(self.inner_opt, paddle.fluid.optimizer.SGD)
def _disable_strategy(self, dist_strategy):
dist_strategy.localsgd = False
dist_strategy.localsgd_configs = {}
def _enable_strategy(self, dist_strategy):
def _enable_strategy(self, dist_strategy, context):
dist_strategy.localsgd = True
dist_strategy.localsgd_configs = {"k_steps": 1}
dist_strategy.localsgd_configs = {"k_steps": 1, "begin_step": 1}
def snapshot_name(self, param_name):
return param_name + self.snapshot_key
......@@ -83,8 +86,9 @@ class LocalSGDOptimizer(MetaOptimizerBase):
minimized = self.inner_opt.minimize(
loss, startup_program=startup_program)
init_k_steps = self.user_defined_strategy.localsgd_configs['k_steps']
auto_steps = self.user_defined_strategy.auto
k_steps_value = self.user_defined_strategy.localsgd_configs['k_steps']
begin_step_value = self.user_defined_strategy.localsgd_configs[
'begin_step']
if startup_program is None:
startup_program = default_startup_program()
......@@ -98,45 +102,28 @@ class LocalSGDOptimizer(MetaOptimizerBase):
p2s = self.create_snapshot_vars(main_block.program)
with program_guard(main_block.program, startup_program):
step = layers.autoincreased_step_counter(begin=0)
step = layers.autoincreased_step_counter(begin=1)
k_steps = layers.create_global_var(
name="k_steps",
shape=[1],
value=init_k_steps,
value=k_steps_value,
dtype='int64',
persistable=True)
begin_step = layers.create_global_var(
name="begin_step",
shape=[1],
value=begin_step_value,
dtype='int64',
persistable=True)
last_step = layers.create_global_var(
name="last_step",
shape=[1],
value=int(0),
value=begin_step_value,
dtype='int64',
persistable=True)
if auto_steps:
avg_loss = layers.collective._c_allreduce(
loss) / self.role_maker.worker_num()
lr_0 = layers.create_global_var(
name="lr_0",
shape=[1],
value=float(0),
dtype='float32',
persistable=True)
loss_0 = layers.create_global_var(
name="loss_0",
shape=[1],
value=float(0),
dtype='float32',
persistable=True)
global_lr = self.inner_opt._global_learning_rate()
def initialize():
layers.assign(loss, loss_0)
layers.assign(global_lr, lr_0)
layers.cond(step == 0, initialize)
def communicate():
sub_block = default_main_program().current_block()
ring_id = -1
......@@ -192,20 +179,10 @@ class LocalSGDOptimizer(MetaOptimizerBase):
inputs={'X': [param]},
outputs={'Out': [snapshot]},
attrs={OP_ROLE_KEY: OpRole.Optimize})
if auto_steps:
next_local_steps = layers.cast(
layers.ceil(
layers.sqrt(lr_0 * loss / (global_lr * loss_0) *
float(init_k_steps))),
dtype='int64')
max_local_steps = layers.fill_constant(
shape=[1], dtype='int64', value=16)
next_local_steps = layers.elementwise_min(next_local_steps,
max_local_steps)
layers.assign(next_local_steps, k_steps)
layers.assign(step, last_step)
layers.cond(step - last_step == k_steps, communicate)
def begin_localsgd():
layers.cond(step - last_step == k_steps, communicate)
layers.cond(step > begin_step, begin_localsgd, communicate)
return minimized
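The rewritten control flow synchronizes on every step until begin_step, then only every k_steps steps. A pure-Python sketch of that schedule (illustrative helper):

def localsgd_sync_steps(total_steps, k_steps, begin_step):
    # Mirrors the layers.cond logic above: every step communicates until
    # begin_step is reached; afterwards only when step - last_step == k_steps.
    synced = []
    last_step = begin_step
    for step in range(1, total_steps + 1):  # the step counter begins at 1
        if step <= begin_step or step - last_step == k_steps:
            synced.append(step)
            last_step = step
    return synced

# k_steps=4, begin_step=3 over 12 steps -> sync at steps 1, 2, 3, 7, 11
assert localsgd_sync_steps(12, 4, 3) == [1, 2, 3, 7, 11]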
......@@ -48,7 +48,7 @@ class MetaOptimizerBase(Optimizer):
raise NotImplementedError("you should implement disable strategy in {}".
format(type(self).__name__))
def _enable_strategy(self, dist_strategy):
def _enable_strategy(self, dist_strategy, context=None):
raise NotImplementedError("you should implement enable strategy in {}".
format(type(self).__name__))
......
......@@ -24,6 +24,9 @@ class ParameterServerGraphOptimizer(ParameterServerOptimizer):
self.meta_optimizers_white_list = []
def _can_apply(self):
if self.role_maker._is_collective:
return False
k_steps = self.user_defined_strategy.a_sync_configs["k_steps"]
if k_steps < 0:
return False
......@@ -37,12 +40,11 @@ class ParameterServerGraphOptimizer(ParameterServerOptimizer):
return True
def _disable_strategy(self, dist_strategy):
dist_strategy.a_sync_configs = {}
return
def _enable_strategy(self, dist_strategy):
def _enable_strategy(self, dist_strategy, context):
# only open up the async mode for auto-parallel
dist_strategy.a_sync = True
dist_strategy.a_sync_configs = {}
return
def _is_graph_out(self):
return True
......
......@@ -32,8 +32,6 @@ class ParameterServerOptimizer(MetaOptimizerBase):
def _can_apply(self):
if self.role_maker._is_collective:
return False
if self.user_defined_strategy.auto == True:
return True
k_steps = self.user_defined_strategy.a_sync_configs["k_steps"]
return True if k_steps >= 0 else False
......@@ -134,7 +132,7 @@ class ParameterServerOptimizer(MetaOptimizerBase):
return _main, _startup
def _try_auto_apply_geo(self, program, compiled_config):
def _can_apply_geo(self, dist_strategy, program):
def get_sys_free_mem():
plat = platform.system()
if platform.system() == "Darwin":
......@@ -163,36 +161,28 @@ class ParameterServerOptimizer(MetaOptimizerBase):
"%s platform is unsupported is parameter server optimizer" %
(platform.system()))
if self.user_defined_strategy.auto == False:
return
a_sync_configs = self.user_defined_strategy.a_sync_configs
if a_sync_configs["k_steps"] >= 0:
return
self.user_defined_strategy.a_sync = True
if not isinstance(self.inner_opt, fluid.optimizer.SGDOptimizer):
# auto async
a_sync_configs["k_steps"] = 0
self.user_defined_strategy.a_sync_configs = a_sync_configs
return
return False
from paddle.fluid.incubate.fleet.parameter_server.ir.vars_metatools import dtype_to_size
free = get_sys_free_mem()
param_grad_pairs = compiled_config.origin_sparse_pairs + compiled_config.origin_dense_pairs
processed_var_names = set(["@EMPTY@"])
from paddle.fluid.incubate.fleet.parameter_server.ir import vars_metatools
processed_var_names = set(["@EMPTY@"])
param_memory_size = 0
for param_grad_pair in param_grad_pairs:
param, grad = param_grad_pair
for varname in program.global_block().vars:
var = program.global_block().vars[varname]
if not var.persistable or var.desc.type(
) != core.VarDesc.VarType.LOD_TENSOR:
continue
param = vars_metatools.create_var_struct(var)
param_memory_size += param.m_size
processed_var_names.add(param.name)
processed_var_names.add(varname)
upper_mem_use = param_memory_size * 5.0
program_tmp_vars = dict()
batch_size = 1024
eval_batch_size = 1024
for op in program.global_block().ops:
for var_name in op.output_arg_names:
if var_name in processed_var_names:
......@@ -215,23 +205,21 @@ class ParameterServerOptimizer(MetaOptimizerBase):
data_count *= (-x)
else:
data_count *= x
program_tmp_vars[var_name] = (data_count, neg_dim_count,
dtype_to_size[var.dtype])
program_tmp_vars[var_name] = (
data_count, neg_dim_count,
vars_metatools.dtype_to_size[var.dtype])
for varname in program_tmp_vars:
data_count, neg_dim_count, type_size = program_tmp_vars[varname]
if neg_dim_count == 1:
data_count *= batch_size
data_count *= eval_batch_size
var_memory = data_count * type_size
upper_mem_use += var_memory
if upper_mem_use < free:
# auto geo
a_sync_configs["k_steps"] = 800
return True
else:
# auto async
a_sync_configs["k_steps"] = 0
self.user_defined_strategy.a_sync_configs = a_sync_configs
return False
def minimize_impl(self,
loss,
......@@ -240,6 +228,7 @@ class ParameterServerOptimizer(MetaOptimizerBase):
no_grad_set=None):
self.inner_opt.minimize(loss, startup_program, parameter_list,
no_grad_set)
strategy = self._get_distributed_strategy()
_origin_main_program = loss.block.program
_origin_startup_program = startup_program
......@@ -247,11 +236,7 @@ class ParameterServerOptimizer(MetaOptimizerBase):
compiled_config = public.CompileTimeStrategy(_origin_main_program,
_origin_startup_program,
None, self.role_maker)
self._try_auto_apply_geo(_origin_main_program, compiled_config)
strategy = self._get_distributed_strategy()
strategy, self.role_maker)
compiled_config.strategy = strategy
if self.role_maker.is_worker() or self.role_maker._is_heter_worker():
......@@ -267,9 +252,24 @@ class ParameterServerOptimizer(MetaOptimizerBase):
return None, None
def _disable_strategy(self, dist_strategy):
dist_strategy.a_sync_configs = {}
self.user_defined_strategy.a_sync_configs = {}
dist_strategy.a_sync = False
a_sync_configs = dist_strategy.a_sync_configs
a_sync_configs["k_steps"] = -1
dist_strategy.a_sync_configs = a_sync_configs
def _enable_strategy(self, dist_strategy, context):
a_sync_configs = dist_strategy.a_sync_configs
if a_sync_configs["k_steps"] >= 0:
return
def _enable_strategy(self, dist_strategy):
dist_strategy.a_sync = True
dist_strategy.a_sync_configs = {}
a_sync_configs = dist_strategy.a_sync_configs
is_geo = self._can_apply_geo(dist_strategy,
context["origin_main_program"])
if is_geo:
a_sync_configs["k_steps"] = 800
else:
a_sync_configs["k_steps"] = 0
dist_strategy.a_sync_configs = a_sync_configs
......@@ -103,6 +103,9 @@ class PipelineOptimizer(MetaOptimizerBase):
self.wrapped_opt = PO(self.inner_opt, num_microbatches=num_microbatches)
def _can_apply(self):
if not self.role_maker._is_collective:
return False
if self.user_defined_strategy.pipeline == True:
return True
return False
......@@ -111,7 +114,7 @@ class PipelineOptimizer(MetaOptimizerBase):
dist_strategy.pipeline = False
dist_strategy.pipeline_configs = {}
def _enable_strategy(self, dist_strategy):
def _enable_strategy(self, dist_strategy, context):
# we do not support enable pipeline automatically right now
return
......@@ -180,7 +183,7 @@ class PipelineOptimizer(MetaOptimizerBase):
grad = None
for idx, op in reversed(list(enumerate(block.ops))):
if is_backward_op(op) and \
OP_ROLE_VAR_KEY in op.attr_names:
OP_ROLE_VAR_KEY in op.attr_names:
op_role_var = op.all_attrs()[OP_ROLE_VAR_KEY]
if len(op_role_var) == 0:
continue
......
......@@ -38,6 +38,9 @@ class RecomputeOptimizer(MetaOptimizerBase):
list(user_defined_strategy.recompute_configs["checkpoints"]))
def _can_apply(self):
if self.role_maker._is_collective:
return False
if self.user_defined_strategy.recompute == True:
if len(self.user_defined_strategy.recompute_configs[
"checkpoints"]) == 0:
......@@ -49,7 +52,7 @@ class RecomputeOptimizer(MetaOptimizerBase):
dist_strategy.recompute = False
dist_strategy.recompute_configs = {}
def _enable_strategy(self, dist_strategy):
def _enable_strategy(self, dist_strategy, context):
# we do not support automatically recompute checkpoints currently
return
......
......@@ -1756,6 +1756,12 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
op_path_dict = dict()
op_path = _find_op_path_(block, targets, inputs, block_no_grad_set,
op_path_dict)
# find no grad var by op_path
no_grad_vars = _find_no_grad_vars(block, op_path, targets,
block_no_grad_set)
block_no_grad_set.update(no_grad_vars)
no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set)))
grad_to_var = dict()
grad_info_map = dict()
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid.data_feeder import check_variable_and_dtype, check_type
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.framework import Variable
__all__ = ['check_finite_and_unscale', 'update_loss_scaling']
def check_finite_and_unscale(x, scale, name=None):
"""
Check if input X contains all finite data, if yes, scale it by input Scale.
$$Out = X / scale$$
If any tensor in X contains Inf or NaN, an indicator is produced:
FoundInfinite will be 1 (True), and Out will not be scaled. In this case, the data of
Out should not be used, and its value may not be deterministic.
Otherwise, FoundInfinite will be 0 (False).
Args:
x(list|tuple): The input tensors of check_finite_and_unscale operator.
scale (Variable): The scale factor of the check_finite_and_unscale operator.
"""
check_type(x, 'x', (tuple, list), 'check_finite_and_unscale')
for e in x:
check_variable_and_dtype(e, "x", ['float32', 'float64'],
'check_finite_and_unscale')
helper = LayerHelper("check_finite_and_unscale", **locals())
found_inf = helper.create_variable_for_type_inference(dtype='bool')
inputs = {'X': x, 'Scale': scale}
outputs = {'Out': x, 'FoundInfinite': found_inf}
helper.append_op(
type='check_finite_and_unscale', inputs=inputs, outputs=outputs)
return x, found_inf
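A NumPy mirror of the op's contract described in the docstring (sketch only; the real op works in place on the program's variables):

import numpy as np

def check_finite_and_unscale_np(xs, scale):
    # If every element of every tensor is finite, divide each by `scale`;
    # otherwise flag found_inf and leave the data unscaled (and unusable).
    found_inf = any(not np.all(np.isfinite(x)) for x in xs)
    if not found_inf:
        xs = [x / scale for x in xs]
    return xs, found_inf

grads, found_inf = check_finite_and_unscale_np(
    [np.array([2.0, 4.0])], scale=2.0)   # -> [array([1., 2.])], False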
def update_loss_scaling(x,
found_inf,
prev_loss_scaling,
num_good_steps,
num_bad_steps,
incr_every_n_steps,
decr_every_n_nan_or_inf,
incr_ratio,
decr_ratio,
name=None):
"""
Update loss scaling according to the overall gradients. If all gradients are
finite after incr_every_n_steps consecutive steps, loss scaling will increase
by incr_ratio. Otherwise, loss scaling will decrease by decr_ratio after
decr_every_n_nan_or_inf accumulated steps in which some gradients are infinite.
Args:
x(list|tuple): The input tensors of update_loss_scaling operator.
found_inf (Variable): A boolean variable indicating whether
there is any infinite gradient.
prev_loss_scaling (Variable): Previous loss scaling.
num_good_steps (Variable): A variable that accumulates good steps in which
all gradients are finite.
num_bad_steps (Variable): A variable that accumulates bad steps in which
some gradients are infinite.
incr_every_n_steps (int): Increase loss scaling every n
consecutive steps with finite gradients.
decr_every_n_nan_or_inf (int): Decrease loss scaling every n
accumulated steps with nan or inf gradients.
incr_ratio(float): The multiplier to use when increasing the loss
scaling.
decr_ratio(float): The less-than-one multiplier to use when decreasing
loss scaling.
"""
check_variable_and_dtype(prev_loss_scaling, "prev_loss_scaling",
['float32', 'float64'], "update_loss_scaling")
check_type(x, 'x', (tuple, list), 'update_loss_scaling')
for e in x:
check_variable_and_dtype(e, "x", ['float32', 'float64'],
'update_loss_scaling')
assert prev_loss_scaling.dtype == e.dtype, "The dtype of prev_loss_scaling should be equal to the dtype of x."
helper = LayerHelper("update_loss_scaling", **locals())
inputs = {
'X': x,
'FoundInfinite': found_inf,
'PrevLossScaling': prev_loss_scaling,
'InGoodSteps': num_good_steps,
'InBadSteps': num_bad_steps
}
outputs = {
'Out': x,
'LossScaling': prev_loss_scaling,
'OutGoodSteps': num_good_steps,
'OutBadSteps': num_bad_steps
}
attrs = {
'incr_every_n_steps': incr_every_n_steps,
'decr_every_n_nan_or_inf': decr_every_n_nan_or_inf,
'incr_ratio': incr_ratio,
'decr_ratio': decr_ratio,
}
helper.append_op(
type='update_loss_scaling', inputs=inputs, outputs=outputs, attrs=attrs)
return x
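# Editor's sketch (not part of the diff): the update rule implemented by the
# op above, restated in plain Python. It mirrors the Switch-based
# implementation removed from fp16_utils later in this diff; reference
# semantics only, not the actual kernel.
def _update_loss_scaling_reference(found_inf, scaling, good, bad,
                                   incr_every_n_steps,
                                   decr_every_n_nan_or_inf, incr_ratio,
                                   decr_ratio):
    import math
    if not found_inf:
        if good + 1 > incr_every_n_steps:  # enough consecutive finite steps
            new_scaling = scaling * incr_ratio
            # only adopt the increased scaling if it is still finite
            if not math.isinf(new_scaling) and not math.isnan(new_scaling):
                scaling = new_scaling
            good, bad = 0, 0
        else:
            good, bad = good + 1, 0
    else:
        if bad + 1 > decr_every_n_nan_or_inf:  # enough accumulated bad steps
            scaling = max(scaling * decr_ratio, 1.0)  # never drop below 1.0
            good, bad = 0, 0
        else:
            good, bad = 0, bad + 1
    return scaling, good, bad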
......@@ -17,9 +17,11 @@ from ... import default_startup_program
from ... import layers
from ... import unique_name
from . import fp16_utils
from .fp16_utils import update_loss_scaling, rewrite_program
from .fp16_utils import rewrite_program
from .fp16_utils import update_role_var_grad
from .fp16_lists import AutoMixedPrecisionLists
from .amp_nn import check_finite_and_unscale
from .amp_nn import update_loss_scaling
__all__ = ["decorate"]
......@@ -67,10 +69,8 @@ class OptimizerWithMixedPrecision(object):
persistable=True)
self._use_dynamic_loss_scaling = use_dynamic_loss_scaling
if self._use_dynamic_loss_scaling:
self._incr_every_n_steps = layers.fill_constant(
shape=[1], dtype='int32', value=incr_every_n_steps)
self._decr_every_n_nan_or_inf = layers.fill_constant(
shape=[1], dtype='int32', value=decr_every_n_nan_or_inf)
self._incr_every_n_steps = incr_every_n_steps
self._decr_every_n_nan_or_inf = decr_every_n_nan_or_inf
self._incr_ratio = incr_ratio
self._decr_ratio = decr_ratio
self._num_good_steps = layers.create_global_var(
......@@ -139,49 +139,46 @@ class OptimizerWithMixedPrecision(object):
# Change the op_role_var attr for some ops, so that gradients
# transferred across GPUs can be FP16.
update_role_var_grad(self._train_program, self._params_grads)
scaled_params_grads = []
for p, g in self._params_grads:
with self._train_program._optimized_guard([p, g]):
scaled_g = g / self._loss_scaling
scaled_params_grads.append([p, scaled_g])
return scaled_params_grads
return self._params_grads
def apply_gradients(self, scaled_params_grads):
def apply_gradients(self, params_grads):
"""
Check scaled gradients to determine whether to update loss scaling and update
parameters by their scaled gradients,
Args:
scaled_params_grads (list): A list of params and scaled grads.
params_grads (list): A list of params and scaled grads.
Returns:
A list of optimize operators.
"""
if self._use_dynamic_loss_scaling:
grads = [g for _, g in params_grads]
with self._train_program._optimized_guard(grads):
grads, found_inf = check_finite_and_unscale(
grads, self._loss_scaling, name="find_infinite_scale")
grads = [layers.reduce_sum(g) for [_, g] in scaled_params_grads]
all_grads = layers.concat(grads)
all_grads_sum = layers.reduce_sum(all_grads)
is_overall_finite = layers.isfinite(all_grads_sum)
update_loss_scaling(is_overall_finite, self._loss_scaling,
self._num_good_steps, self._num_bad_steps,
self._incr_every_n_steps,
self._decr_every_n_nan_or_inf, self._incr_ratio,
self._decr_ratio)
# apply_gradient append all ops in global block, thus we shouldn't
# apply gradient in the switch branch.
with layers.Switch() as switch:
with switch.case(is_overall_finite):
pass
with switch.default():
for _, g in scaled_params_grads:
layers.assign(layers.zeros_like(g), g)
optimize_ops = self._optimizer.apply_gradients(scaled_params_grads)
if self._use_dynamic_loss_scaling:
with self._train_program._optimized_guard(grads):
grads = update_loss_scaling(
grads,
found_inf,
self._loss_scaling,
self._num_good_steps,
self._num_bad_steps,
self._incr_every_n_steps,
self._decr_every_n_nan_or_inf,
self._incr_ratio,
self._decr_ratio,
name="update_loss_scaling")
params_unscaled_grads = []
for pg, new_g in zip(params_grads, grads):
params_unscaled_grads.append((pg[0], new_g))
# apply_gradient append all ops in global block, thus we shouldn't
# apply gradient in the switch branch.
optimize_ops = self._optimizer.apply_gradients(params_unscaled_grads)
return optimize_ops
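# Editor's summary (comments only): with dynamic loss scaling enabled, the
# rewritten step above is
#   1. grads, found_inf = check_finite_and_unscale(grads, loss_scaling)
#   2. grads = update_loss_scaling(grads, found_inf, ...)
#   3. optimizer.apply_gradients(params paired with the returned grads)
# i.e. the explicit reduce_sum/isfinite/Switch bookkeeping is folded into the
# two fused ops; the update_loss_scaling op is expected to take over zeroing
# the gradients on overflow, which the removed Switch branch used to do.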
......
......@@ -328,77 +328,3 @@ def update_role_var_grad(main_prog, params_grads):
raise ValueError("The op {0} is not in program".format(op))
block.desc._remove_op(op_idx, op_idx + 1)
block._sync_with_cpp()
def update_loss_scaling(is_overall_finite, prev_loss_scaling, num_good_steps,
num_bad_steps, incr_every_n_steps,
decr_every_n_nan_or_inf, incr_ratio, decr_ratio):
"""
Update loss scaling according to overall gradients. If all gradients are
finite after incr_every_n_steps consecutive steps, loss scaling will be
increased by incr_ratio. Otherwise, loss scaling will be decreased by
decr_ratio after decr_every_n_nan_or_inf accumulated steps in which some
gradients are infinite.
Args:
is_overall_finite (Variable): A boolean variable indicates whether
all gradients are finite.
prev_loss_scaling (Variable): Previous loss scaling.
num_good_steps (Variable): A variable accumulates good steps in which
all gradients are finite.
num_bad_steps (Variable): A variable accumulates bad steps in which
some gradients are infinite.
incr_every_n_steps (Variable): A variable representing increasing loss
scaling every n consecutive steps with
finite gradients.
decr_every_n_nan_or_inf (Variable): A variable representing decreasing
loss scaling every n accumulated
steps with nan or inf gradients.
incr_ratio(float): The multiplier to use when increasing the loss
scaling.
decr_ratio(float): The less-than-one-multiplier to use when decreasing
loss scaling.
"""
zero_steps = layers.fill_constant(shape=[1], dtype='int32', value=0)
with layers.Switch() as switch:
with switch.case(is_overall_finite):
should_incr_loss_scaling = layers.less_than(incr_every_n_steps,
num_good_steps + 1)
with layers.Switch() as switch1:
with switch1.case(should_incr_loss_scaling):
new_loss_scaling = prev_loss_scaling * incr_ratio
loss_scaling_is_finite = layers.isfinite(new_loss_scaling)
with layers.Switch() as switch2:
with switch2.case(loss_scaling_is_finite):
layers.assign(new_loss_scaling, prev_loss_scaling)
with switch2.default():
pass
layers.assign(zero_steps, num_good_steps)
layers.assign(zero_steps, num_bad_steps)
with switch1.default():
layers.increment(num_good_steps)
layers.assign(zero_steps, num_bad_steps)
with switch.default():
should_decr_loss_scaling = layers.less_than(decr_every_n_nan_or_inf,
num_bad_steps + 1)
with layers.Switch() as switch3:
with switch3.case(should_decr_loss_scaling):
new_loss_scaling = prev_loss_scaling * decr_ratio
static_loss_scaling = \
layers.fill_constant(shape=[1],
dtype='float32',
value=1.0)
less_than_one = layers.less_than(new_loss_scaling,
static_loss_scaling)
with layers.Switch() as switch4:
with switch4.case(less_than_one):
layers.assign(static_loss_scaling,
prev_loss_scaling)
with switch4.default():
layers.assign(new_loss_scaling, prev_loss_scaling)
layers.assign(zero_steps, num_good_steps)
layers.assign(zero_steps, num_bad_steps)
with switch3.default():
layers.assign(zero_steps, num_good_steps)
layers.increment(num_bad_steps)
......@@ -192,7 +192,6 @@ class ImperativeQuantAware(object):
assert len(input_dtype) == len(
feed), "The length of input_shape should be equal to feed's."
prog_trans = dygraph.ProgramTranslator()
with dygraph.guard():
model.eval()
input_vars = []
......
......@@ -209,15 +209,24 @@ class FakeQuantAbsMax(layers.Layer):
return quant_out
def _get_fake_quant_type(quant_type, name, moving_rate, quant_bits, dtype,
quant_on_weight):
def _get_fake_quant_type(quant_type, **kwargs):
call_args = {
"name": kwargs.get("name", None),
"quant_bits": kwargs.get("quant_bits", 8),
"dtype": kwargs.get("dtype", "float32")
}
if quant_type == 'abs_max':
call_args["quant_on_weight"] = kwargs.get("quant_on_weight", False)
elif quant_type == 'moving_average_abs_max':
call_args["moving_rate"] = kwargs.get("moving_rate", 0.9)
fake_quant_map = {
'abs_max':
lambda: FakeQuantAbsMax(name, quant_bits, dtype, quant_on_weight),
'moving_average_abs_max':
lambda: FakeQuantMovingAverage(name, moving_rate, quant_bits, dtype)
'abs_max': FakeQuantAbsMax,
'moving_average_abs_max': FakeQuantMovingAverage
}
return fake_quant_map[quant_type]()
return fake_quant_map[quant_type](**call_args)
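# Editor's note (comments only): with the kwargs-based signature above, each
# quant type receives only the arguments it understands, e.g.
#   _get_fake_quant_type('abs_max', name=w_name, quant_bits=8,
#                        dtype='float32', quant_on_weight=True)
#   _get_fake_quant_type('moving_average_abs_max', name=a_name,
#                        moving_rate=0.9, quant_bits=8, dtype='float32')
# which is exactly how QuantizedConv2D and QuantizedLinear below call it;
# w_name and a_name are illustrative placeholders.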
class QuantizedConv2D(layers.Layer):
......@@ -247,11 +256,18 @@ class QuantizedConv2D(layers.Layer):
self.bias = getattr(layer, 'bias')
# For FakeQuant
self._fake_quant_weight = _get_fake_quant_type(
weight_quantize_type, self.weight.name, moving_rate, weight_bits,
self._dtype, True)
weight_quantize_type,
name=self.weight.name,
moving_rate=moving_rate,
quant_bits=weight_bits,
dtype=self._dtype,
quant_on_weight=True)
self._fake_quant_input = _get_fake_quant_type(
activation_quantize_type,
layer.full_name(), moving_rate, activation_bits, self._dtype, False)
name=layer.full_name(),
moving_rate=moving_rate,
quant_bits=activation_bits,
dtype=self._dtype)
def forward(self, input):
quant_input = self._fake_quant_input(input)
......@@ -326,11 +342,18 @@ class QuantizedLinear(layers.Layer):
self.bias = getattr(layer, 'bias')
# For FakeQuant
self._fake_quant_weight = _get_fake_quant_type(
weight_quantize_type, self.weight.name, moving_rate, weight_bits,
self._dtype, True)
weight_quantize_type,
name=self.weight.name,
moving_rate=moving_rate,
quant_bits=weight_bits,
dtype=self._dtype,
quant_on_weight=True)
self._fake_quant_input = _get_fake_quant_type(
activation_quantize_type,
layer.full_name(), moving_rate, activation_bits, self._dtype, False)
name=layer.full_name(),
moving_rate=moving_rate,
quant_bits=activation_bits,
dtype=self._dtype)
def forward(self, input):
quant_input = self._fake_quant_input(input)
......
......@@ -347,6 +347,92 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
return self.__next__()
# NOTE(chenweihang): _worker_loop must be a top-level function to be picklable
def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event,
collate_fn, init_fn, worker_id, num_workers,
use_shared_memory):
try:
# NOTE: [ mmap files clear ] When the child process exits unexpectedly,
# some shared memory objects may have been applied for but not yet put
# into the inter-process Queue; these objects need to be cleaned up when
# the process ends.
CleanupFuncRegistrar.register(_cleanup_mmap)
# set signal handler
core._set_process_signal_handler()
global _worker_info
_worker_info = WorkerInfo(
id=worker_id, num_workers=num_workers, dataset=dataset)
init_exception = None
try:
if init_fn is not None:
init_fn(worker_id)
fetcher = _DatasetKind.create_fetcher(dataset_kind, dataset,
collate_fn, True)
except:
init_exception = Exception("init_fn failed in worker {}: " \
"{}".format(worker_id, sys.exc_info()))
iterator_drained = False
parent_watch_dog = ParentWatchDog()
while parent_watch_dog.is_alive():
try:
data = indices_queue.get(MP_INDICES_CHECK_INTERVAL)
except queue.Empty:
continue
# None is the poison pill, so the worker done event should be set
if data is None:
assert done_event.is_set() or iterator_drained, \
"get None when worker done_event set"
break
# If the worker done event is set but data is still arriving in
# indices_queue, the remaining data should be fetched and skipped.
if done_event.is_set() or iterator_drained:
continue
idx, indices = data
try:
if init_exception is not None:
batch = init_exception
init_exception = None
else:
batch = fetcher.fetch(indices)
except Exception as e:
if isinstance(
e, StopIteration) and dataset_kind == _DatasetKind.ITER:
out_queue.put(_IterableDatasetStopIteration(worker_id))
iterator_drained = True
else:
out_queue.put((idx, e))
else:
if use_shared_memory:
# FIXME(dkp): _convert_to_tensor_list only supports np.array
# lists now; it should support paddle.Tensor lists
if isinstance(batch[0][0], paddle.Tensor):
np_batch = []
for sample in batch:
np_batch.append([s.numpy() for s in sample])
batch = np_batch
tensor_list = core._convert_to_tensor_list(batch)
out_queue.put((idx, tensor_list))
core._remove_tensor_list_mmap_fds(tensor_list)
else:
out_queue.put((idx, batch))
except KeyboardInterrupt:
# NOTE: Main process will raise KeyboardInterrupt anyways, ignore it in child process
pass
except:
six.reraise(*sys.exc_info())
finally:
if use_shared_memory:
_cleanup_mmap()
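# Editor's note (not part of the diff): _worker_loop is hoisted to module
# level because multiprocessing pickles the process target; a module-level
# function pickles by its qualified name, while the bound method removed from
# _DataLoaderIterMultiProcess below would drag the whole iterator object
# (queues, events, threads) into the child process. The same hoisting is
# applied to _reader_process_loop in reader.py later in this diff.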
class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
def __init__(self, loader):
super(_DataLoaderIterMultiProcess, self).__init__(loader)
......@@ -404,11 +490,11 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
indices_queue = multiprocessing.Queue()
self._indices_queues.append(indices_queue)
worker = multiprocessing.Process(
target=self._worker_loop,
target=_worker_loop,
args=(self._dataset, self._dataset_kind, indices_queue,
self._data_queue, self._workers_done_event,
self._collate_fn, self._worker_init_fn, i,
self._num_workers))
self._num_workers, self._use_shared_memory))
worker.daemon = True
worker.start()
self._workers.append(worker)
......@@ -483,90 +569,6 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
self._blocking_queue.kill()
logging.error("DataLoader reader thread raised an exception!")
def _worker_loop(self, dataset, dataset_kind, indices_queue, out_queue,
done_event, collate_fn, init_fn, worker_id, num_workers):
try:
# NOTE: [ mmap files clear ] When the child process exits unexpectedly,
# some shared memory objects may have been applied for but not yet put
# into the inter-process Queue; these objects need to be cleaned up when
# the process ends.
CleanupFuncRegistrar.register(_cleanup_mmap)
# set signal handler
core._set_process_signal_handler()
global _worker_info
_worker_info = WorkerInfo(
id=worker_id, num_workers=num_workers, dataset=dataset)
init_exception = None
try:
if init_fn is not None:
init_fn(worker_id)
fetcher = _DatasetKind.create_fetcher(dataset_kind, dataset,
collate_fn, True)
except:
init_exception = Exception("init_fn failed in worker {}: " \
"{}".format(worker_id, sys.exc_info()))
iterator_drained = False
parent_watch_dog = ParentWatchDog()
while parent_watch_dog.is_alive():
try:
data = indices_queue.get(MP_INDICES_CHECK_INTERVAL)
except queue.Empty:
continue
# None is the poison pill, so the worker done event should be set
if data is None:
assert done_event.is_set() or iterator_drained, \
"get None when worker done_event set"
break
# If the worker done event is set but data is still arriving in
# indices_queue, the remaining data should be fetched and skipped.
if done_event.is_set() or iterator_drained:
continue
idx, indices = data
try:
if init_exception is not None:
batch = init_exception
init_exception = None
else:
batch = fetcher.fetch(indices)
except Exception as e:
if isinstance(
e,
StopIteration) and dataset_kind == _DatasetKind.ITER:
out_queue.put(_IterableDatasetStopIteration(worker_id))
iterator_drained = True
else:
out_queue.put((idx, e))
else:
if self._use_shared_memory:
# FIXME(dkp): _convert_to_tensor_list only supports np.array
# lists now; it should support paddle.Tensor lists
if isinstance(batch[0][0], paddle.Tensor):
np_batch = []
for sample in batch:
np_batch.append([s.numpy() for s in sample])
batch = np_batch
tensor_list = core._convert_to_tensor_list(batch)
out_queue.put((idx, tensor_list))
core._remove_tensor_list_mmap_fds(tensor_list)
else:
out_queue.put((idx, batch))
except KeyboardInterrupt:
# NOTE: Main process will raise KeyboardInterrupt anyways, ignore it in child process
pass
except:
six.reraise(*sys.exc_info())
finally:
if self._use_shared_memory:
_cleanup_mmap()
def _thread_loop(self):
while not self._thread_done_event.is_set():
batch = self._get_data()
......
......@@ -210,13 +210,12 @@ class AmpScaler(object):
def _unscale(self, optimizer):
if not self._enable:
return
inv_scale = 1.0 / self._scale
param_grads = [
param._grad_ivar() for param in optimizer._parameter_list
if param._grad_ivar() is not None
]
core.ops.amp_check_finite_and_scale(param_grads, inv_scale, param_grads,
self._found_inf)
core.ops.check_finite_and_unscale(param_grads, self._scale, param_grads,
self._found_inf)
def _update(self):
"""
......
......@@ -25,6 +25,7 @@ from .tracer import Tracer
import logging
import objgraph
from ..data_feeder import convert_dtype
import warnings
__all__ = [
'no_grad', 'no_grad_', 'grad', 'guard', 'enable_dygraph', 'disable_dygraph',
......@@ -609,10 +610,10 @@ def to_variable(value, name=None, zero_copy=None, dtype=None):
uint8, uint16, complex64, complex128}.
name(str, optional): The default value is None. Normally there is no
need for user to set this property. For more information, please
refer to :ref:`api_guide_Name` .
refer to :ref:`api_guide_Name` .
zero_copy(bool, optional): Whether to share memory with the input numpy
array. This parameter only works with CPUPlace and will be set to
True when it is None. Default: None.
True when it is None. Default: None. (Note: zero_copy is temporarily disabled for the reasons noted below.)
dtype(str, optional): The desired data type of returned ``Variable`` .
Can be 'bool' , 'float16' , 'float32' , 'float64' , 'int8' , 'int16' ,
'int32' , 'int64' , 'uint8' . Default: None.
......@@ -665,8 +666,17 @@ def to_variable(value, name=None, zero_copy=None, dtype=None):
else:
if isinstance(framework._current_expected_place(),
framework.core.CPUPlace):
if zero_copy is None:
zero_copy = True
#TODO(zhiqiu): we found two problems when enabling zero_copy on CPUPlace.
# (1): Eigen requires 16-byte alignment, but the data of a numpy array may not satisfy it.
# Details: https://eigen.tuxfamily.org/dox/group__TopicUnalignedArrayAssert.html
# (2): when used in the flask framework, it may result in a hang.
# Details: https://github.com/PaddlePaddle/Paddle/issues/26635
# So, we temporarily disable the zero_copy strategy.
if zero_copy == True:
warnings.warn(
"Currently, zero_copy is not supported, and it will be discarded."
)
zero_copy = False
else:
assert not zero_copy, "zero_copy mode can only be used with CPUPlace"
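# Editor's note (illustrative): the net effect is that on CPUPlace,
#     to_variable(np_arr, zero_copy=True)
# now emits the UserWarning above and falls back to copying, so later
# in-place edits of np_arr no longer affect the created Variable.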
......
......@@ -25,7 +25,7 @@ import warnings
from .. import core
from .base import guard
from paddle.fluid.dygraph.jit import SaveLoadConfig, deprecate_save_load_configs
from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers
from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers, EXTRA_VAR_INFO_FILENAME
__all__ = [
'save_dygraph',
......@@ -233,6 +233,19 @@ def load_dygraph(model_path, config=None):
para_dict = dict()
for var_name in persistable_var_dict:
para_dict[var_name] = persistable_var_dict[var_name].numpy()
# if __variables.info__ exists, we can recover structured_name
var_info_path = os.path.join(model_prefix, EXTRA_VAR_INFO_FILENAME)
if os.path.exists(var_info_path):
with open(var_info_path, 'rb') as f:
extra_var_info = pickle.load(f)
structured_para_dict = dict()
for var_name in para_dict:
structured_name = extra_var_info[var_name].get(
'structured_name', None)
assert structured_name is not None, "Cannot find saved variable (%s)'s structured name in saved model." % var_name
structured_para_dict[structured_name] = para_dict[var_name]
para_dict = structured_para_dict
else:
# Load state dict by `save_dygraph` save format
para_dict = {}
......
......@@ -13,6 +13,7 @@
# limitations under the License.
import os
import six
import sys
import traceback
......@@ -20,6 +21,14 @@ from paddle.fluid.dygraph.dygraph_to_static.origin_info import Location, OriginI
ERROR_DATA = "Error data about original source code information and traceback."
# A flag to set whether to open the dygraph2static error reporting module
SIMPLIFY_ERROR_ENV_NAME = "TRANSLATOR_SIMPLIFY_NEW_ERROR"
DEFAULT_SIMPLIFY_NEW_ERROR = 1
# A flag to set whether to display the simplified error stack
DISABLE_ERROR_ENV_NAME = "TRANSLATOR_DISABLE_NEW_ERROR"
DEFAULT_DISABLE_NEW_ERROR = 0
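# Editor's sketch: both flags are read via os.getenv when an error is raised,
# so they can be toggled per process before running dygraph-to-static code,
# exactly as the unit tests later in this diff do:
#   import os
#   os.environ[DISABLE_ERROR_ENV_NAME] = "1"   # re-raise the original error
#   os.environ[SIMPLIFY_ERROR_ENV_NAME] = "0"  # keep the full error stack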
def attach_error_data(error, in_runtime=False):
"""
......@@ -103,7 +112,10 @@ class ErrorData(object):
# Simplify error value to improve readability if error is raised in runtime
if self.in_runtime:
self._simplify_error_value()
if int(
os.getenv(SIMPLIFY_ERROR_ENV_NAME,
DEFAULT_SIMPLIFY_NEW_ERROR)):
self._simplify_error_value()
message_lines.append(str(self.error_value))
return '\n'.join(message_lines)
......@@ -150,3 +162,22 @@ class ErrorData(object):
error_value_str = '\n'.join(error_value_lines)
self.error_value = self.error_type(error_value_str)
def raise_new_exception(self):
# Raise the original error if the dygraph2static error module is disabled.
if int(os.getenv(DISABLE_ERROR_ENV_NAME, DEFAULT_DISABLE_NEW_ERROR)):
raise
new_exception = self.create_exception()
if six.PY3:
# NOTE(liym27):
# 1. Why `raise new_exception from None`?
# In Python 3, by default, a new exception is raised with the trace information of the caught exception.
# This raises only new_exception and hides unwanted implementation details from the traceback of the
# caught exception.
# 2. Use exec to bypass syntax error checking in Python 2.
six.exec_("raise new_exception from None")
else:
raise new_exception
......@@ -124,8 +124,13 @@ class OriginInfoAttacher(gast.NodeTransformer):
def _abs_lineno(self, node):
# NOTE(liym27):
# If the first gast.FunctionDef has decorator, its lineno is 1, which
# equals to the lineno of the first decorator node.
# There are differences in ast_node.lineno between PY3.8+ and PY3.8-.
# If the first gast.FunctionDef has a decorator, the lineno of the gast.FunctionDef differs:
# 1. < PY3.8
# its lineno equals the lineno of the first decorator node, which is not right.
# 2. >= PY3.8
# its lineno is the actual lineno, which is right.
return self.lineno_offset + node.lineno
def _abs_col_offset(self, node):
......
......@@ -32,8 +32,7 @@ from paddle.fluid.layers.utils import flatten
from paddle.fluid.dygraph.base import param_guard
from paddle.fluid.dygraph.base import switch_to_static_graph
from paddle.fluid.dygraph.dygraph_to_static import DygraphToStaticAst
from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA
from paddle.fluid.dygraph.dygraph_to_static.error import attach_error_data
from paddle.fluid.dygraph.dygraph_to_static import error
from paddle.fluid.dygraph.dygraph_to_static import logging_utils
from paddle.fluid.dygraph.dygraph_to_static.origin_info import attach_origin_info
from paddle.fluid.dygraph.dygraph_to_static.origin_info import create_and_update_origin_info_map
......@@ -315,6 +314,7 @@ class StaticLayer(object):
# 2. trace ops from dygraph layers and cache the generated program.
args, kwargs = self._function_spec.unified_args_and_kwargs(args, kwargs)
try:
concrete_program, partial_program_layer = self.get_concrete_program(
*args, **kwargs)
......@@ -324,27 +324,22 @@ class StaticLayer(object):
partial_program_layer.training = self._class_instance.training
# 4. return outputs.
return partial_program_layer(args)
try:
return partial_program_layer(args)
except Exception as e:
if not hasattr(e, error.ERROR_DATA):
# runtime error
error.attach_error_data(e, in_runtime=True)
raise
except Exception as e:
if not hasattr(e, ERROR_DATA):
# runtime error
attach_error_data(e, in_runtime=True)
error_data = getattr(e, ERROR_DATA, None)
error_data = getattr(e, error.ERROR_DATA, None)
if error_data:
new_exception = error_data.create_exception()
if six.PY3:
# NOTE(liym27):
# 1. Why `raise new_exception from None`?
# In Python 3, by default, a new exception is raised with the trace information of the caught exception.
# This raises only new_exception and hides unwanted implementation details from the traceback of the
# caught exception.
# 2. Use exec to bypass syntax error checking in Python 2.
six.exec_("raise new_exception from None")
else:
raise new_exception
error_data.raise_new_exception()
else:
raise
logging_utils.warn(
"Please file an issue at 'https://github.com/PaddlePaddle/Paddle/issues'"
" if you can't handle this {} yourself.".format(type(e)))
raise e
def _call_dygraph_function(self, *args, **kwargs):
"""
......@@ -593,7 +588,7 @@ class ConcreteProgram(object):
outputs = static_func(*inputs)
except BaseException as e:
# NOTE: If e is raised at compile time, e should be attached to ERROR_DATA here.
attach_error_data(e)
error.attach_error_data(e)
raise
if not isinstance(outputs,
......@@ -813,28 +808,36 @@ class ProgramTranslator(object):
"The ProgramTranslator.get_output doesn't work when setting ProgramTranslator.enable = False. "
"We will just return dygraph output.")
return dygraph_func(*args, **kwargs)
function_spec = FunctionSpec(dygraph_func)
cache_key = CacheKey.from_func_and_args(function_spec, args, kwargs,
getattr(dygraph_func,
'__self__', None))
_, partial_program_layer = self._program_cache[cache_key]
if args and isinstance(args[0], layers.Layer):
# Synchronize self.training attribute.
partial_program_layer.training = args[0].training
args = args[1:]
try:
return partial_program_layer(args)
function_spec = FunctionSpec(dygraph_func)
cache_key = CacheKey.from_func_and_args(function_spec, args, kwargs,
getattr(dygraph_func,
'__self__', None))
_, partial_program_layer = self._program_cache[cache_key]
if args and isinstance(args[0], layers.Layer):
# Synchronize self.training attribute.
partial_program_layer.training = args[0].training
args = args[1:]
try:
return partial_program_layer(args)
except BaseException as e:
# NOTE:
# 1. If e is raised at compile time, e should have been attached to ERROR_DATA before;
# 2. If e is raised at runtime, e should be attached to ERROR_DATA here.
if not hasattr(e, error.ERROR_DATA):
# runtime error
error.attach_error_data(e, in_runtime=True)
raise
except BaseException as e:
# NOTE:
# 1. If e is raised at compile time, e should have been attached to ERROR_DATA before;
# 2. If e is raised at runtime, e should be attached to ERROR_DATA here.
if not hasattr(e, ERROR_DATA):
# runtime error
attach_error_data(e, in_runtime=True)
raise
error_data = getattr(e, error.ERROR_DATA, None)
if error_data:
error_data.raise_new_exception()
else:
logging_utils.warn(
"Please file an issue at 'https://github.com/PaddlePaddle/Paddle/issues'"
" if you can't handle this {} yourself.".format(type(e)))
raise e
def get_func(self, dygraph_func):
"""
......
......@@ -479,11 +479,15 @@ def _load_persistable_vars(model_path,
var_file_path = os.path.join(model_path, params_filename)
else:
var_file_path = os.path.join(model_path, VARIABLE_FILENAME)
framework._dygraph_tracer().trace_op(
type='load_combine',
inputs={},
outputs={'Out': load_var_list},
attrs={'file_path': var_file_path})
if not os.path.exists(var_file_path):
if len(extra_var_info) != 0:
raise ValueError("The model to be loaded is incomplete.")
else:
framework._dygraph_tracer().trace_op(
type='load_combine',
inputs={},
outputs={'Out': load_var_list},
attrs={'file_path': var_file_path})
return load_var_dict
......
......@@ -18,6 +18,7 @@ import os
import pickle
import warnings
import functools
from collections import OrderedDict
import six
import paddle
......@@ -211,7 +212,16 @@ def declarative(function=None, input_spec=None):
# for usage: `declarative(foo, ...)`
if function is not None:
return decorated(function)
if isinstance(function, Layer):
if isinstance(function.forward, StaticLayer):
class_name = function.__class__.__name__
warnings.warn(
"`{}.forward` has already been decorated somewhere. It will be redecorated to replace previous one.".
format(class_name))
function.forward = decorated(function.forward)
return function
else:
return decorated(function)
# for usage: `@declarative`
return decorated
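# Editor's sketch: with the Layer branch above, a model instance can be
# decorated directly (SimpleNet and the shapes are illustrative; see the new
# TestDecorateModelDirectly unit test later in this diff):
#   net = SimpleNet()
#   net = declarative(net)  # rebinds net.forward to a StaticLayer
#   net = declarative(net, input_spec=[InputSpec([None, 8, 10])])
#   # warns that `SimpleNet.forward` is already decorated, then redecorates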
......@@ -633,6 +643,73 @@ class SaveLoadConfig(object):
self._keep_name_table = value
def _get_input_var_names(inputs, input_spec):
name_none_error = "The %s's name is None. " \
"When using jit.save, please set InputSepc's name in " \
"to_static(input_spec=[]) and jit.save(input_spec=[]) " \
"and make sure they are consistent."
name_no_exists_error = "The tensor `%s` does not exists. " \
"Please make sure the name of InputSpec or example Tensor " \
"in input_spec is the same as the name of InputSpec in " \
"`to_static` decorated on the Layer.forward method."
result_list = []
input_var_names = [var.name for var in inputs if isinstance(var, Variable)]
if input_spec is None:
# no prune
result_list = input_var_names
elif input_spec is not None and len(input_spec) == len(input_var_names):
# no prune
result_list = input_var_names
# if input spec name not in input_var_names, only raise warning
for spec in input_spec:
if spec.name is None:
warnings.warn(name_none_error % spec)
elif spec.name not in input_var_names:
warnings.warn(name_no_exists_error % spec.name)
else:
# do nothing
pass
else:
# prune
for spec in input_spec:
if spec.name is None:
# name is None, the input_spec only can be InputSpec
raise ValueError(name_none_error % spec)
elif spec.name not in input_var_names:
# the input_spec can be `InputSpec` or `VarBase`
raise ValueError(name_no_exists_error % spec.name)
else:
result_list.append(spec.name)
return result_list
def _get_output_vars(outputs, output_spec):
name_no_exists_error = "The tensor `%s` does not exists. " \
"Please make sure the name of example Tensor " \
"in configs.output_spec is the output tensor of " \
"Layer.forward method."
result_list = []
output_vars_dict = OrderedDict()
for var in outputs:
if isinstance(var, Variable):
output_vars_dict[var.name] = var
if output_spec is None:
result_list = output_vars_dict.values()
elif output_spec is not None and len(output_spec) == len(output_vars_dict):
result_list = output_vars_dict.values()
for var in output_spec:
if var.name not in output_vars_dict:
warnings.warn(name_no_exists_error % var.name)
else:
for var in output_spec:
if var.name not in output_vars_dict:
raise ValueError(name_no_exists_error % var.name)
else:
result_list.append(output_vars_dict[var.name])
return result_list
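# Editor's sketch: the prune/no-prune rule shared by the two helpers above,
# restated over plain name lists (a stand-in for illustration, not Paddle
# API; the warn-on-unknown-name behavior of the no-prune path is omitted).
def _example_prune_rule(var_names, spec_names):
    if spec_names is None or len(spec_names) == len(var_names):
        return list(var_names)  # no prune
    unknown = [n for n in spec_names if n not in var_names]
    if unknown:
        raise ValueError("unknown name(s): %s" % unknown)
    return list(spec_names)  # prune down to the named subset
# _example_prune_rule(['x', 'y'], None)  -> ['x', 'y']
# _example_prune_rule(['x', 'y'], ['x']) -> ['x']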
# NOTE(chenweihang): change jit.save/load argument `configs` to `config`
def deprecate_save_load_configs(func):
@functools.wraps(func)
......@@ -753,26 +830,6 @@ def save(layer, model_path, input_spec=None, config=None):
paddle.jit.save(layer, model_path)
"""
def get_inout_spec(all_vars, target_vars, return_name=False):
result_list = []
valid_var_dict = {}
valid_vars = [var for var in all_vars if isinstance(var, Variable)]
for var in valid_vars:
valid_var_dict[var.name] = var
if target_vars:
for i, var in enumerate(target_vars):
# check target var whether exists
if var.name not in valid_var_dict:
raise RuntimeError(
"The variable to feed/fetch are not exist.")
result_list.append(valid_var_dict[var.name])
else:
result_list = valid_vars
if return_name:
result_list = [var.name for var in result_list]
return result_list
# 1. input check
prog_translator = ProgramTranslator()
if not prog_translator.enable:
......@@ -788,25 +845,58 @@ def save(layer, model_path, input_spec=None, config=None):
if configs is None:
configs = SaveLoadConfig()
# avoid changing the user-given input_spec
inner_input_spec = None
if input_spec is not None:
if not isinstance(input_spec, list):
raise TypeError(
"The input input_spec should be 'list', but received input_spec's type is %s."
% type(input_spec))
inner_input_spec = []
for var in input_spec:
if not isinstance(var, (core.VarBase, Variable,
paddle.static.InputSpec)):
if isinstance(var, paddle.static.InputSpec):
inner_input_spec.append(var)
elif isinstance(var, (core.VarBase, Variable)):
inner_input_spec.append(
paddle.static.InputSpec.from_tensor(var))
else:
raise TypeError(
"The element in input_spec list should be 'Variable' or `paddle.static.InputSpec`, but received element's type is %s."
% type(var))
# 2. get program of declarative Layer.forward
if not isinstance(layer.forward, StaticLayer):
raise RuntimeError(
"layer.forward need to be decorated by `@declarative`.")
concrete_program = layer.forward.concrete_program
# NOTE: we maintain the mapping of variable name to
# 2. get program from Layer
# TODO(chenweihang): add support for other methods, not only forward
if isinstance(layer.forward, StaticLayer):
concrete_program = layer.forward.concrete_program
else:
# transform in jit.save; if input_spec is incomplete, declarative will throw an error
static_forward = declarative(layer.forward, input_spec=inner_input_spec)
concrete_program = static_forward.concrete_program
# the input_spec has already been used in declarative, which is equivalent
# to @declarative with input_spec plus jit.save without input_spec;
# clear it to avoid a needless warning
inner_input_spec = None
# 3. build input & output of save_infernece_model
# NOTE(chenweihang): [ Get input variables name ]
# There are two cases, whether to prune the inputs or not
# - not prune inputs (recommended):
# - len(input_spec) == len(concrete_program.inputs) - 1
# - here we can use concrete_program.inputs directly
# - prune inputs:
# - len(input_spec) < len(concrete_program.inputs) - 1
# - the input_spec's names should be in concrete_program.inputs
input_var_names = _get_input_var_names(concrete_program.inputs,
inner_input_spec)
# NOTE(chenweihang): [ Get output variables ]
# the rule is like [ Get input variables name ]. For output vars,
# we only support VarBase specs; actually, we only need the
# var names of the outputs, and we don't recommend using output_spec
output_vars = _get_output_vars(concrete_program.outputs,
configs.output_spec)
# NOTE(chenweihang): we maintain the mapping of variable name to
# structured name; the buffer variables (non-persistable)
# saved to the inference program may not be needed by the dygraph Layer,
# so we only record the state_dict variables' structured names
......@@ -814,7 +904,7 @@ def save(layer, model_path, input_spec=None, config=None):
for structured_name, var in six.iteritems(layer.state_dict()):
state_names_dict[var.name] = structured_name
# 3. share parameters from Layer to scope & record var info
# 4. share parameters from Layer to scope & record var info
scope = core.Scope()
extra_var_info = dict()
for param_or_buffer in concrete_program.parameters:
......@@ -832,10 +922,6 @@ def save(layer, model_path, input_spec=None, config=None):
extra_info_dict['trainable'] = param_or_buffer.trainable
extra_var_info[param_or_buffer.name] = extra_info_dict
# 4. build input & output spec
input_var_names = get_inout_spec(concrete_program.inputs, input_spec, True)
output_vars = get_inout_spec(concrete_program.outputs, configs.output_spec)
# 5. save inference model
from paddle.fluid.io import save_inference_model
......@@ -856,7 +942,7 @@ def save(layer, model_path, input_spec=None, config=None):
export_for_deployment=configs._export_for_deployment,
program_only=configs._program_only)
# NOTE: [ Save extra variable info ]
# NOTE(chenweihang): [ Save extra variable info ]
# save_inference_model will lose some important variable information, including:
# - Variable name and correspondence (when saved variables as one file)
# - Variable.stop_gradient information
......
......@@ -19,7 +19,6 @@ from ..framework import Variable, convert_np_dtype_to_dtype_, _varbase_creator
from ..layers.layer_function_generator import OpProtoHolder
from ..layers import common_methods
from . import to_variable, no_grad
import paddle
import numpy as np
import six
......@@ -163,26 +162,6 @@ def monkey_patch_math_varbase():
def _scalar_div_(var, value):
return _scalar_elementwise_op_(var, 1.0 / value, 0.0)
# TODO(shenliang03): currently, it supports divide, floor_divide, remainder
# for binary operators by using the API to achieve type promotion
def _binary_method_creator_(op_type, reverse=False):
import paddle
def __impl__(self, other_var):
import paddle
op = getattr(paddle, op_type)
if reverse:
return op(other_var, self)
else:
return op(self, other_var)
__impl__.__doc__ = """
See paddle.{}""".format(op_type)
__impl__.__name__ = op_type
return __impl__
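# Editor's sketch (dygraph mode; paddle.divide/floor_divide/remainder are the
# APIs named in the table below; values are illustrative):
#   import numpy as np
#   import paddle
#   paddle.disable_static()
#   a = paddle.to_tensor(np.array([7, 8], dtype='int64'))
#   b = paddle.to_tensor(np.array([2, 3], dtype='int64'))
#   (a // b).numpy()  # -> [3, 2], dispatched to paddle.floor_divide
#   (a % b).numpy()   # -> [1, 2], dispatched to paddle.remainder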
# for binary operator such as elementwise, compare
def _binary_creator_(method_name,
op_type,
......@@ -281,20 +260,22 @@ def monkey_patch_math_varbase():
## a*b == b*a. Do not need to reverse explicitly
('__rmul__',
_binary_creator_('__rmul__', 'elementwise_mul', False, _scalar_mul_)),
('__div__', _binary_creator_('__div__', 'elementwise_div', False,
_scalar_div_)),
('__truediv__', _binary_creator_('__truediv__', 'elementwise_div',
False, _scalar_div_)),
('__rdiv__', _binary_creator_('__rdiv__', 'elementwise_div', True,
None)),
('__rtruediv__', _binary_creator_('__rtruediv__', 'elementwise_div', True,
None)),
('__pow__', _binary_creator_('__pow__', 'elementwise_pow', False,
None)),
('__rpow__', _binary_creator_('__rpow__', 'elementwise_pow', True,
None)),
# These binary operators use the corresponding paddle.<op_type> API
('__div__', _binary_method_creator_('divide', False)),
('__truediv__', _binary_method_creator_('divide', False)),
('__rtruediv__', _binary_method_creator_('divide', True)),
('__rdiv__', _binary_method_creator_('divide', True)),
('__floordiv__', _binary_method_creator_('floor_divide', False)),
('__rfloordiv__', _binary_method_creator_('floor_divide', True)),
('__mod__', _binary_method_creator_('remainder', False)),
('__floordiv__', _binary_creator_('__floordiv__',
'elementwise_floordiv', False, None)),
('__mod__', _binary_creator_('__mod__', 'elementwise_mod', False,
None)),
## for logical compare
('__eq__', _binary_creator_('__eq__', 'equal', False, None)),
('__ne__', _binary_creator_('__ne__', 'not_equal', False, None)),
......
......@@ -16,7 +16,6 @@ from __future__ import print_function
import warnings
import inspect
import paddle
from .. import core
from ..framework import Variable, unique_name
......@@ -46,7 +45,6 @@ EXPRESSION_MAP = {
"__pow__": "A ** B",
"__rpow__": "A **= B",
"__floordiv__": "A //B",
"__rfloordiv__": "A //= B",
"__mod__": "A % B",
"__eq__": "A == B",
"__ne__": "A != B",
......@@ -235,25 +233,6 @@ def monkey_patch_variable():
def _scalar_div_(var, value):
return _scalar_op_(var, 1.0 / value, 0.0)
# TODO(shenliang03): currently, it supports divide, floor_divide, remainder
# for binary operators by using the API to achieve type promotion
def _binary_method_creator_(op_type, reverse=False):
import paddle
def __impl__(self, other_var):
op = getattr(paddle, op_type)
if reverse:
return op(other_var, self)
else:
return op(self, other_var)
__impl__.__doc__ = """
See paddle.{}""".format(op_type)
__impl__.__name__ = op_type
return __impl__
def _binary_creator_(method_name,
op_type,
reverse=False,
......@@ -360,18 +339,22 @@ def monkey_patch_variable():
# a*b == b*a. Do not need to reverse explicitly
('__rmul__',
_binary_creator_('__rmul__', 'elementwise_mul', False, _scalar_mul_)),
('__div__', _binary_creator_('__div__', 'elementwise_div', False,
_scalar_div_)),
('__truediv__', _binary_creator_('__truediv__', 'elementwise_div',
False, _scalar_div_)),
('__rdiv__', _binary_creator_('__rdiv__', 'elementwise_div', True,
None)),
('__rtruediv__', _binary_creator_('__rtruediv__', 'elementwise_div',
True, None)),
('__pow__', _binary_creator_('__pow__', 'elementwise_pow', False,
None)),
('__rpow__', _binary_creator_('__rpow__', 'elementwise_pow', True,
None)),
# These binary operators use the corresponding paddle.<op_type> API
('__div__', _binary_method_creator_('divide', False)),
('__rdiv__', _binary_method_creator_('divide', True)),
('__truediv__', _binary_method_creator_('divide', False)),
('__rtruediv__', _binary_method_creator_('divide', True)),
('__floordiv__', _binary_method_creator_('floor_divide', False)),
('__rfloordiv__', _binary_method_creator_('floor_divide', True)),
('__mod__', _binary_method_creator_('remainder', False)),
('__floordiv__', _binary_creator_('__floordiv__',
'elementwise_floordiv', False, None)),
('__mod__', _binary_creator_('__mod__', 'elementwise_mod', False,
None)),
# for logical compare
('__eq__', _binary_creator_('__eq__', 'equal', False, None)),
('__ne__', _binary_creator_('__ne__', 'not_equal', False, None)),
......
......@@ -6309,7 +6309,7 @@ def unsqueeze(input, axes, name=None):
if isinstance(axes, int):
axes = [axes]
elif isinstance(axes, Variable):
axes = [axes.numpy().item(0)]
axes = axes.numpy().tolist()
elif isinstance(axes, (list, tuple)):
axes = [
item.numpy().item(0) if isinstance(item, Variable) else item
......
......@@ -1604,7 +1604,7 @@ class LarsMomentumOptimizer(Optimizer):
& local\_learning\_rate = learning\_rate * lars\_coeff * \\
\\frac{||param||}{||gradient|| + lars\_weight\_decay * ||param||}
& velocity = mu * velocity + local\_learning\_rate * (gradient + lars\_weight\_decay * param)
& velocity = mu * velocity + local\_learning\_rate * (gradient + lars\_weight\_decay * param + epsilon)
& param = param - velocity
......@@ -1628,7 +1628,9 @@ class LarsMomentumOptimizer(Optimizer):
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): This parameter is used by developers to print debugging information. \
For details, please refer to :ref:`api_guide_Name`. Default is None.
exclude_from_weight_decay (list[str], optional): Names of layers that will be excluded from lars weight decay. Default is None.
epsilon (float, optional): Epsilon to avoid division by zero when calculating the local learning rate. Default is 0.
Examples:
.. code-block:: python
......@@ -1659,7 +1661,9 @@ class LarsMomentumOptimizer(Optimizer):
parameter_list=None,
regularization=None,
grad_clip=None,
name=None):
name=None,
exclude_from_weight_decay=None,
epsilon=0):
assert learning_rate is not None
assert momentum is not None
super(LarsMomentumOptimizer, self).__init__(
......@@ -1672,6 +1676,11 @@ class LarsMomentumOptimizer(Optimizer):
self._momentum = momentum
self._lars_coeff = float(lars_coeff)
self._lars_weight_decay = float(lars_weight_decay)
self._epsilon = float(epsilon)
if exclude_from_weight_decay is None:
self._exclude_from_weight_decay = []
else:
self._exclude_from_weight_decay = exclude_from_weight_decay
def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block)
......@@ -1682,6 +1691,14 @@ class LarsMomentumOptimizer(Optimizer):
def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)
_lars_weight_decay = self._lars_weight_decay
param_name = param_and_grad[0].name
if len(self._exclude_from_weight_decay) > 0:
for name in self._exclude_from_weight_decay:
if name in param_name:
_lars_weight_decay = 0.0
break
velocity_acc = self._get_accumulator(self._velocity_acc_str,
param_and_grad[0])
# create the momentum optimize op
......@@ -1700,7 +1717,8 @@ class LarsMomentumOptimizer(Optimizer):
attrs={
"mu": self._momentum,
"lars_coeff": self._lars_coeff,
"lars_weight_decay": self._lars_weight_decay
"lars_weight_decay": _lars_weight_decay,
"epsilon": self._epsilon
},
stop_gradient=True)
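# Editor's sketch (not part of the diff): constructing the optimizer with the
# two new arguments; entries in exclude_from_weight_decay are matched as
# substrings of parameter names, per the loop above. Values are illustrative.
#   optimizer = fluid.optimizer.LarsMomentumOptimizer(
#       learning_rate=0.001,
#       momentum=0.9,
#       lars_weight_decay=0.0005,
#       exclude_from_weight_decay=["batch_norm"],
#       epsilon=1e-8)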
......
......@@ -85,6 +85,30 @@ def _convert_places(places):
return ret
# NOTE(chenweihang): _reader_process_loop must be a top-level function to be picklable
def _reader_process_loop(batch_reader, data_queue):
try:
# set signal handler
core._set_process_signal_handler()
# NOTE: [ mmap files clear ] When the child process exits unexpectedly,
# some shared memory objects may have been applied for but not yet put
# into the inter-process Queue; these objects need to be cleaned up when
# the process ends.
CleanupFuncRegistrar.register(_cleanup_mmap)
for batch in batch_reader():
tensor_list = core._convert_to_tensor_list(batch)
data_queue.put(tensor_list)
core._remove_tensor_list_mmap_fds(tensor_list)
data_queue.put(None)
except KeyboardInterrupt:
# NOTE: Main process will raise KeyboardInterrupt anyways, ignore it in child process
pass
except:
six.reraise(*sys.exc_info())
class DataLoaderBase(object):
def __init__(self):
self._places = None
......@@ -811,7 +835,8 @@ class DygraphGeneratorLoader(DataLoaderBase):
global multiprocess_queue_set
multiprocess_queue_set.add(self._data_queue)
self._process = multiprocessing.Process(
target=self._reader_process_loop)
target=_reader_process_loop,
args=(self._batch_reader, self._data_queue))
self._process.daemon = True
self._process.start()
......@@ -867,28 +892,6 @@ class DygraphGeneratorLoader(DataLoaderBase):
self._blocking_queue.kill()
logging.error("DataLoader reader thread raised an exception!")
def _reader_process_loop(self):
try:
# set signal handler
core._set_process_signal_handler()
# NOTE: [ mmap files clear ] When the child process exits unexpectedly,
# some shared memory objects may have been applied for but not yet put
# into the inter-process Queue; these objects need to be cleaned up when
# the process ends.
CleanupFuncRegistrar.register(_cleanup_mmap)
for batch in self._batch_reader():
tensor_list = core._convert_to_tensor_list(batch)
self._data_queue.put(tensor_list)
core._remove_tensor_list_mmap_fds(tensor_list)
self._data_queue.put(None)
except KeyboardInterrupt:
# NOTE: Main process will raise KeyboardInterrupt anyways, ignore it in child process
pass
except:
six.reraise(*sys.exc_info())
def _reader_thread_loop_for_multiprocess(self):
while not self._thread_done_event.is_set():
try:
......
......@@ -47,6 +47,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_dgc_meta_optimizer)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_private_function)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_graph_executor)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_meta_optimizer_base)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_distributed_strategy)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_auto)
foreach(TEST_OP ${MIXED_DIST_TEST_OPS})
list(REMOVE_ITEM TEST_OPS ${TEST_OP})
......@@ -440,8 +441,6 @@ if(WITH_DISTRIBUTE)
# FIXME(seiriosX) will fix this
list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_sparse_embedding_ctr")
list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_gloo")
list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_a_sync_optimizer_auto")
list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_ctr")
py_test_modules(test_recv_save_op MODULES test_recv_save_op ENVS ${dist_ENVS})
py_test_modules(test_transpiler_ops MODULES test_transpiler_ops ENVS ${dist_ENVS})
......@@ -461,6 +460,7 @@ if(WITH_DISTRIBUTE)
py_test_modules(test_fleet_pipeline_meta_optimizer MODULES test_fleet_pipeline_meta_optimizer ENVS ${dist_ENVS})
py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS})
py_test_modules(test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS ${dist_ENVS})
py_test_modules(test_fleet_distributed_strategy MODULES test_fleet_distributed_strategy)
py_test_modules(test_fleet_auto MODULES test_fleet_auto ENVS ${dist_ENVS})
if(NOT WIN32)
py_test_modules(test_fleet_localsgd_meta_optimizer MODULES test_fleet_localsgd_meta_optimizer ENVS ${dist_ENVS})
......@@ -559,8 +559,8 @@ endif()
set_tests_properties(test_parallel_executor_test_while_train test_parallel_executor_mnist
test_parallel_executor_feed_persistable_var
test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass
test_data_norm_op test_imperative_using_non_zero_gpu test_fuse_bn_act_pass
test_optimizer_in_control_flow test_dataloader_keep_order
test_data_norm_op test_imperative_using_non_zero_gpu
test_dataloader_keep_order
test_dataloader_unkeep_order
test_parallel_executor_fetch_isolated_var
test_parallel_executor_inference_feed_partial_data
......
......@@ -332,5 +332,31 @@ class TestDeclarativeAPI(unittest.TestCase):
func(np.ones(5).astype("int32"))
class TestDecorateModelDirectly(unittest.TestCase):
def setUp(self):
paddle.disable_static()
program_trans.enable(True)
self.x = to_variable(np.ones([4, 10]).astype('float32'))
def test_fake_input(self):
net = SimpleNet()
net = declarative(net)
y = net(self.x)
self.assertTrue(len(net.forward.program_cache) == 1)
def test_input_spec(self):
net = SimpleNet()
net = declarative(net, input_spec=[InputSpec([None, 8, 10])])
self.assertTrue(len(net.forward.inputs) == 1)
self.assertTrue(len(net.forward.program_cache) == 1)
input_shape = net.forward.inputs[0].shape
self.assertListEqual(list(input_shape), [-1, 8, 10])
# redecorate
net = declarative(net, input_spec=[InputSpec([None, 16, 10])])
input_shape = net.forward.inputs[0].shape
self.assertListEqual(list(input_shape), [-1, 16, 10])
if __name__ == '__main__':
unittest.main()
......@@ -14,15 +14,15 @@
from __future__ import print_function
import os
import inspect
import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.core import EnforceNotMet
from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA, ErrorData
from paddle.fluid.dygraph.dygraph_to_static import error
from paddle.fluid.dygraph.dygraph_to_static.origin_info import unwrap
from paddle.fluid.dygraph.jit import declarative
def inner_func():
......@@ -30,7 +30,7 @@ def inner_func():
return
@declarative
@paddle.jit.to_static
def func_error_in_compile_time(x):
x = fluid.dygraph.to_variable(x)
inner_func()
......@@ -41,14 +41,14 @@ def func_error_in_compile_time(x):
return x_v
@declarative
@paddle.jit.to_static
def func_error_in_compile_time_2(x):
x = fluid.dygraph.to_variable(x)
x = fluid.layers.reshape(x, shape=[1, 2])
return x
@declarative
@paddle.jit.to_static
def func_error_in_runtime(x, iter_num=3):
x = fluid.dygraph.to_variable(x)
two = fluid.layers.fill_constant(shape=[1], value=2, dtype="int32")
......@@ -61,6 +61,9 @@ class TestErrorInCompileTime(unittest.TestCase):
self.set_func()
self.set_input()
self.set_exception_type()
self.prog_trans = paddle.jit.ProgramTranslator()
self.simplify_error = 1
self.disable_error = 0
def set_func(self):
self.func = func_error_in_compile_time
......@@ -88,14 +91,38 @@ class TestErrorInCompileTime(unittest.TestCase):
for m in self.expected_message:
self.assertIn(m, error_message)
def test(self):
with fluid.dygraph.guard():
with self.assertRaises(self.exception_type) as cm:
self.func(self.input)
exception = cm.exception
error_data = getattr(exception, ERROR_DATA)
self.assertIsInstance(error_data, ErrorData)
self._test_create_message(error_data)
def _test_attach_and_raise_new_exception(self, func_call):
paddle.disable_static()
with self.assertRaises(self.exception_type) as cm:
func_call()
exception = cm.exception
error_data = getattr(exception, error.ERROR_DATA, None)
self.assertIsInstance(error_data, error.ErrorData)
self._test_create_message(error_data)
def test_static_layer_call(self):
# NOTE: self.func(self.input) is the StaticLayer().__call__(self.input)
call_dy2static = lambda: self.func(self.input)
self.set_flags(0)
self._test_attach_and_raise_new_exception(call_dy2static)
def test_program_translator_get_output(self):
call_dy2static = lambda : self.prog_trans.get_output(unwrap(self.func), self.input)
self.set_flags(0)
self._test_attach_and_raise_new_exception(call_dy2static)
def set_flags(self, disable_error=0, simplify_error=1):
os.environ[error.DISABLE_ERROR_ENV_NAME] = str(disable_error)
self.disable_error = int(os.getenv(error.DISABLE_ERROR_ENV_NAME, 0))
self.assertEqual(self.disable_error, disable_error)
os.environ[error.SIMPLIFY_ERROR_ENV_NAME] = str(simplify_error)
self.simplify_error = int(os.getenv(error.SIMPLIFY_ERROR_ENV_NAME, 1))
self.assertEqual(self.simplify_error, simplify_error)
class TestErrorInCompileTime2(TestErrorInCompileTime):
......@@ -143,5 +170,28 @@ class TestErrorInRuntime(TestErrorInCompileTime):
self.assertIn(m, error_message)
@unwrap
@paddle.jit.to_static()
def func_decorated_by_other_1():
return 1
@paddle.jit.to_static()
@unwrap
def func_decorated_by_other_2():
return 1
class TestErrorInOther(unittest.TestCase):
def test(self):
paddle.disable_static()
prog_trans = paddle.jit.ProgramTranslator()
with self.assertRaises(NotImplementedError):
prog_trans.get_output(func_decorated_by_other_1)
with self.assertRaises(NotImplementedError):
func_decorated_by_other_2()
if __name__ == '__main__':
unittest.main()
......@@ -14,6 +14,7 @@
from __future__ import print_function
import sys
import unittest
from paddle.fluid.dygraph.dygraph_to_static.ast_transformer import DygraphToStaticAst
......@@ -177,8 +178,20 @@ class TestOriginInfoWithDecoratedFunc(TestOriginInfo):
def set_dygraph_info(self):
self.line_num = 2
self.line_index_list = [0, 2]
self.dy_rel_lineno_list = [0, 2]
# NOTE(liym27):
# There are differences in ast_node.lineno between PY3.8+ and PY3.8-.
# If the first gast.FunctionDef has a decorator, the lineno of the gast.FunctionDef differs:
# 1. < PY3.8
# its lineno equals the lineno of the first decorator node, which is not right.
# 2. >= PY3.8
# its lineno is the actual lineno, which is right.
if sys.version_info >= (3, 8):
self.line_index_list = [1, 2]
self.dy_rel_lineno_list = [1, 2]
else:
self.line_index_list = [0, 2]
self.dy_rel_lineno_list = [0, 2]
self.dy_abs_col_offset = [0, 4]
self.dy_func_name = [self.dygraph_func.__name__] * self.line_num
......@@ -199,8 +212,13 @@ class TestOriginInfoWithDecoratedFunc2(TestOriginInfo):
def set_dygraph_info(self):
self.line_num = 2
self.line_index_list = [0, 3]
self.dy_rel_lineno_list = [0, 3]
if sys.version_info >= (3, 8):
self.line_index_list = [2, 3]
self.dy_rel_lineno_list = [2, 3]
else:
self.line_index_list = [0, 3]
self.dy_rel_lineno_list = [0, 3]
self.dy_abs_col_offset = [0, 4]
self.dy_func_name = [self.dygraph_func.__name__] * self.line_num
......
......@@ -14,6 +14,7 @@
from __future__ import print_function
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.layers.utils import flatten
from paddle.fluid.dygraph import declarative, ProgramTranslator
......@@ -151,5 +152,33 @@ class TestWithTrainAndEval(unittest.TestCase):
partial_layer._train_program)
class GPT2LMHeadModel(fluid.dygraph.Layer):
def __init__(self):
super(GPT2LMHeadModel, self).__init__()
self.embedding0 = paddle.nn.Embedding(20, 16)
self.embedding1 = paddle.nn.Embedding(20, 32)
self.lm_head_weight = paddle.to_tensor(
np.random.rand(2, 3).astype('float32'))
@declarative
def forward(self, x):
x = fluid.layers.reshape(x, shape=[-1, 6])
x1, x2, x3 = fluid.layers.split(input=x, dim=1, num_or_sections=3)
return x1
class TestPruneUnusedParamInProgram(unittest.TestCase):
def test_prune(self):
input_ids = np.array([[15, 11, 6, 3, 18, 13]]).astype("float32")
place = fluid.CPUPlace()
with fluid.dygraph.guard(place):
model = GPT2LMHeadModel()
model.eval()
input_ids = paddle.to_tensor(input_ids)
out = model(input_ids)
self.assertTrue(np.array_equal(out.numpy(), [[15, 11]]))
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from paddle.fluid.tests.unittests.op_test import OpTest
from paddle.fluid.tests.unittests.test_fusion_gru_op import fusion_gru
from paddle.fluid.tests.unittests.test_fusion_lstm_op import fc, ACTIVATION
class TestFusionGRUINT8MKLDNNOp(OpTest):
def set_confs(self):
pass
def setUp(self):
self.op_type = "fusion_gru"
self.lod = [[2, 4, 3]]
self.IC = 3
self.OC = 5
self.is_reverse = False
self.with_h0 = False
self.with_bias = True
self.act_state = 'tanh'
self.act_gate = 'sigmoid'
self.origin_mode = True
self.use_mkldnn = True
self.force_fp32_output = True
self.error_margin = 1e-5
self.set_confs()
# RNN dimensions
T = sum(self.lod[0])
N = len(self.lod[0])
# Input data
x_f32 = np.random.rand(T, self.IC).astype('float32') * 2 - 1
scale_data = 63
shift_data = 64
x_u8 = (x_f32 * scale_data + shift_data).astype(np.uint8)
# WeightX/WeightH data
wx = np.random.rand(self.IC, 3 * self.OC).astype('float32') * 2 - 1
wh = np.random.rand(self.OC, 3 * self.OC).astype('float32') * 2 - 1
# Calculating weight scales
# scales = 63 / max(abs(channel_wise(weightsX + weightsH)))
# WeightX data shape in PP: [IC, 3 * OC]
# WeightH data shape in PP: [OC, 2 * OC] + [OC, OC]
# Scales shape in oneDNN: [3, OC]
scale_ur = 63 / np.max(np.abs(
np.concatenate(
[
wx[:, :2 * self.OC], wh.flatten()[:2 * self.OC * self.OC]
.reshape(self.OC, 2 * self.OC)
],
axis=0)),
axis=0)
scale_o = 63 / np.max(np.abs(
np.concatenate(
[
wx[:, 2 * self.OC:], wh.flatten()[2 * self.OC * self.OC:]
.reshape(self.OC, self.OC)
],
axis=0)),
axis=0)
scale_weights = np.concatenate([scale_ur, scale_o]).astype('float')
bias = np.random.rand(
1, 3 * self.OC).astype('float32') if self.with_bias else np.zeros(
(1, 3 * self.OC), dtype='float32')
h0 = np.random.rand(
N, self.OC).astype('float32') if self.with_h0 else np.zeros(
(N, self.OC), dtype='float32')
_, _, _, hidden_f32 = fusion_gru(x_f32, self.lod, h0, wx, wh, bias,
self.is_reverse, self.origin_mode,
ACTIVATION[self.act_state],
ACTIVATION[self.act_gate])
self.inputs = {'X': (x_u8, self.lod), 'WeightX': wx, 'WeightH': wh}
if self.with_bias:
self.inputs['Bias'] = bias
if self.with_h0:
self.inputs['H0'] = h0
if self.force_fp32_output:
self.error_margin = 1e-1
self.outputs = {'Hidden': (hidden_f32, self.lod)}
else:
self.error_margin = 1
hidden_u8 = (hidden_f32 * scale_data + shift_data).astype(np.uint8)
self.outputs = {'Hidden': (hidden_u8, self.lod)}
self.attrs = {
'activation': self.act_state,
'gate_activation': self.act_gate,
'is_reverse': self.is_reverse,
'origin_mode': self.origin_mode,
'use_mkldnn': self.use_mkldnn,
'force_fp32_output': self.force_fp32_output,
'Scale_data': scale_data,
'Shift_data': shift_data,
'Scale_weights': scale_weights
}
def test_check_output(self):
self.check_output(check_dygraph=False, atol=self.error_margin)
class TestFusionGRUINT8MKLDNNOp2(TestFusionGRUINT8MKLDNNOp):
def set_confs(self):
self.force_fp32_output = False
class TestFusionGRUINT8MKLDNNOp3(TestFusionGRUINT8MKLDNNOp):
def set_confs(self):
self.origin_mode = False
class TestFusionGRUINT8MKLDNNOp4(TestFusionGRUINT8MKLDNNOp):
def set_confs(self):
self.with_bias = False
class TestFusionGRUINT8MKLDNNOp5(TestFusionGRUINT8MKLDNNOp):
def set_confs(self):
self.with_h0 = False
if __name__ == "__main__":
unittest.main()
......@@ -448,7 +448,6 @@ class TestAdamOpV2(unittest.TestCase):
def test_adam_op_with_state_dict(self):
import paddle
paddle.disable_static()
emb = paddle.nn.Embedding(10, 10)
......@@ -517,6 +516,20 @@ class TestAdamOpV2(unittest.TestCase):
adam = paddle.optimizer.Adam(
0.1, epsilon=-1, parameters=linear.parameters())
def test_adam_op_with_sparse_input_and_weight_decay(self):
paddle.disable_static()
x_data = np.arange(0, 10).reshape((10, 1)).astype(np.int64)
x = paddle.to_tensor(x_data, stop_gradient=False)
emb = paddle.nn.Embedding(10, 10, sparse=True)
adam = paddle.optimizer.Adam(
0.001, parameters=emb.parameters(), weight_decay=0.01)
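        # Weight decay (L2 regularization) cannot be applied to sparse
        # gradient updates, so the step below is expected to raise
        # (rationale inferred from the test name).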
with self.assertRaises(RuntimeError):
out = emb(x)
out.backward()
adam.step()
if __name__ == "__main__":
unittest.main()
......@@ -18,9 +18,9 @@ from op_test import OpTest, skip_check_grad_ci
import paddle.fluid as fluid
class TestAmpCheckFiniteAndScaleOp(OpTest):
class TestCheckFiniteAndUnscaleOp(OpTest):
def setUp(self):
self.op_type = "amp_check_finite_and_scale"
self.op_type = "check_finite_and_unscale"
self.init_dtype()
x = np.random.random((1024, 1024)).astype(self.dtype)
scale = np.random.random((1)).astype(self.dtype)
......@@ -28,7 +28,7 @@ class TestAmpCheckFiniteAndScaleOp(OpTest):
self.inputs = {'X': [('x0', x)], 'Scale': scale}
self.outputs = {
'FoundInfinite': np.array([0]),
'Out': [('out0', x * scale)],
'Out': [('out0', x / scale)],
}
def init_dtype(self):
......@@ -38,9 +38,9 @@ class TestAmpCheckFiniteAndScaleOp(OpTest):
self.check_output()
class TestAmpCheckFiniteAndScaleOpWithNan(OpTest):
class TestCheckFiniteAndUnscaleOpWithNan(OpTest):
def setUp(self):
self.op_type = "amp_check_finite_and_scale"
self.op_type = "check_finite_and_unscale"
self.init_dtype()
x = np.random.random((1024, 1024)).astype(self.dtype)
x[128][128] = np.nan
......@@ -61,9 +61,9 @@ class TestAmpCheckFiniteAndScaleOpWithNan(OpTest):
self.check_output(no_check_set=['Out'])
class TestAmpCheckFiniteAndScaleOpWithInf(OpTest):
class TestCheckFiniteAndUnscaleOpWithInf(OpTest):
def setUp(self):
self.op_type = "amp_check_finite_and_scale"
self.op_type = "check_finite_and_unscale"
self.init_dtype()
x = np.random.random((1024, 1024)).astype(self.dtype)
x[128][128] = np.inf
......
......@@ -15,7 +15,7 @@
from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid.backward import calc_gradient
......@@ -81,5 +81,22 @@ class TestDoubleGrad(unittest.TestCase):
self.assertEqual(12, out[0])
class TestGradientWithPrune(unittest.TestCase):
def test_prune(self):
x = fluid.data(name='x', shape=[3], dtype='float32')
x.stop_gradient = False
x1, x2, x3 = fluid.layers.split(x, dim=0, num_or_sections=3)
y = x1 * 2
x1_grad = fluid.gradients(y, x)
exe = fluid.Executor(fluid.CPUPlace())
main = fluid.default_main_program()
exe.run(fluid.default_startup_program())
out = exe.run(main,
feed={'x': np.ones([3]).astype('float32')},
fetch_list=[x1_grad])
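        # Only x1 feeds y, so the gradient w.r.t. x is 2 in the first slice
        # and pruned to 0 elsewhere.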
self.assertTrue(np.array_equal(out[0], [2., 0., 0.]))
if __name__ == "__main__":
unittest.main()
......@@ -38,6 +38,7 @@ def create_test_class(op_type, typename, callback):
self.check_output()
def test_errors(self):
paddle.enable_static()
with program_guard(Program(), Program()):
x = fluid.layers.data(name='x', shape=[2], dtype='int32')
y = fluid.layers.data(name='y', shape=[2], dtype='int32')
......@@ -80,6 +81,7 @@ def create_paddle_case(op_type, callback):
self.place = paddle.CUDAPlace(0)
def test_api(self):
paddle.enable_static()
with program_guard(Program(), Program()):
x = fluid.data(name='x', shape=[4], dtype='int64')
y = fluid.data(name='y', shape=[4], dtype='int64')
......@@ -92,6 +94,7 @@ def create_paddle_case(op_type, callback):
self.assertEqual((res == self.real_result).all(), True)
def test_broadcast_api_1(self):
paddle.enable_static()
with program_guard(Program(), Program()):
x = paddle.static.data(
name='x', shape=[1, 2, 1, 3], dtype='int32')
......@@ -108,6 +111,7 @@ def create_paddle_case(op_type, callback):
self.assertEqual((res == real_result).all(), True)
def test_attr_name(self):
paddle.enable_static()
with program_guard(Program(), Program()):
x = fluid.layers.data(name='x', shape=[4], dtype='int32')
y = fluid.layers.data(name='y', shape=[4], dtype='int32')
......@@ -130,6 +134,7 @@ create_paddle_case('not_equal', lambda _a, _b: _a != _b)
class TestCompareOpError(unittest.TestCase):
def test_errors(self):
paddle.enable_static()
with program_guard(Program(), Program()):
# The input x and y of compare_op must be Variable.
x = fluid.layers.data(name='x', shape=[1], dtype="float32")
......@@ -140,6 +145,7 @@ class TestCompareOpError(unittest.TestCase):
class API_TestElementwise_Equal(unittest.TestCase):
def test_api(self):
paddle.enable_static()
with fluid.program_guard(fluid.Program(), fluid.Program()):
label = fluid.layers.assign(np.array([3, 3], dtype="int32"))
limit = fluid.layers.assign(np.array([3, 2], dtype="int32"))
......@@ -159,5 +165,31 @@ class API_TestElementwise_Equal(unittest.TestCase):
self.assertEqual((res == np.array([True, True])).all(), True)
class TestCompareOpPlace(unittest.TestCase):
def test_place_1(self):
paddle.enable_static()
place = paddle.CPUPlace()
if core.is_compiled_with_cuda():
place = paddle.CUDAPlace(0)
label = fluid.layers.assign(np.array([3, 3], dtype="int32"))
limit = fluid.layers.assign(np.array([3, 2], dtype="int32"))
out = fluid.layers.less_than(label, limit, force_cpu=True)
exe = fluid.Executor(place)
res, = exe.run(fetch_list=[out])
self.assertEqual((res == np.array([False, False])).all(), True)
def test_place_2(self):
place = paddle.CPUPlace()
data_place = place
if core.is_compiled_with_cuda():
place = paddle.CUDAPlace(0)
data_place = paddle.CUDAPinnedPlace()
paddle.disable_static(place)
data = np.array([9], dtype="int64")
data_tensor = paddle.to_tensor(data, place=data_place)
result = data_tensor == 0
self.assertEqual((result.numpy() == np.array([False])).all(), True)
if __name__ == '__main__':
unittest.main()
......@@ -62,82 +62,6 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
self.assertTrue(a_sync_configs['k_steps'] == 0)
def test_a_sync_optimizer2(self):
os.environ["TRAINING_ROLE"] = "TRAINER"
import paddle.distributed.fleet as fleet
main_program = paddle.fluid.Program()
startup_program = paddle.fluid.Program()
paddle.fluid.framework.switch_main_program(main_program)
paddle.fluid.framework.switch_startup_program(startup_program)
fleet.init(role_maker.PaddleCloudRoleMaker())
input_x = paddle.fluid.layers.data(
name="x", shape=[32], dtype='float32')
input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
cost = paddle.fluid.layers.cross_entropy(
input=prediction, label=input_y)
avg_cost = paddle.fluid.layers.mean(x=cost)
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.auto = True
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
self.assertTrue(optimizer.user_defined_strategy.a_sync)
a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
self.assertTrue(a_sync_configs['k_steps'] == 800)
def test_a_sync_optimizer3(self):
os.environ["TRAINING_ROLE"] = "TRAINER"
import paddle.distributed.fleet as fleet
main_program = paddle.fluid.Program()
startup_program = paddle.fluid.Program()
paddle.fluid.framework.switch_main_program(main_program)
paddle.fluid.framework.switch_startup_program(startup_program)
fleet.init(role_maker.PaddleCloudRoleMaker())
input_x = paddle.fluid.layers.data(
name="x",
shape=[-1, 1],
dtype="int64",
lod_level=1,
append_batch_size=False)
x_embedding = paddle.fluid.layers.embedding(
is_distributed=False,
input=input_x,
size=[1000000000, 100000],
param_attr=paddle.fluid.ParamAttr(
name="embedding",
initializer=paddle.fluid.initializer.Constant(value=0.01)),
is_sparse=True)
input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
fc_1 = paddle.fluid.layers.fc(input=x_embedding, size=64, act='tanh')
fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
cost = paddle.fluid.layers.cross_entropy(
input=prediction, label=input_y)
avg_cost = paddle.fluid.layers.mean(x=cost)
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.auto = True
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
self.assertTrue(optimizer.user_defined_strategy.a_sync)
a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
self.assertTrue(a_sync_configs['k_steps'] == 0)
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle
import os
import paddle.distributed.fleet.base.role_maker as role_maker
import time
class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
def setUp(self):
os.environ["PADDLE_PSERVER_NUMS"] = "2"
os.environ["PADDLE_TRAINERS_NUM"] = "2"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_PORT"] = "36001"
os.environ["PADDLE_TRAINER_ID"] = "0"
os.environ["PADDLE_TRAINERS_NUM"] = "2"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
"127.0.0.1:36001,127.0.0.2:36001"
def test_a_sync_optimizer3(self):
os.environ["TRAINING_ROLE"] = "TRAINER"
import paddle.distributed.fleet as fleet
main_program = paddle.fluid.Program()
startup_program = paddle.fluid.Program()
paddle.fluid.framework.switch_main_program(main_program)
paddle.fluid.framework.switch_startup_program(startup_program)
fleet.init(role_maker.PaddleCloudRoleMaker())
input_x = paddle.fluid.layers.data(
name="x",
shape=[-1, 1],
dtype="int64",
lod_level=1,
append_batch_size=False)
x_embedding = paddle.fluid.layers.embedding(
is_distributed=False,
input=input_x,
size=[1000000000, 100000],
param_attr=paddle.fluid.ParamAttr(
name="embedding",
initializer=paddle.fluid.initializer.Constant(value=0.01)),
is_sparse=True)
input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
fc_1 = paddle.fluid.layers.fc(input=x_embedding, size=64, act='tanh')
fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
cost = paddle.fluid.layers.cross_entropy(
input=prediction, label=input_y)
avg_cost = paddle.fluid.layers.mean(x=cost)
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.auto = True
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
self.assertTrue(optimizer.user_defined_strategy.a_sync)
a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
self.assertTrue(a_sync_configs['k_steps'] == 0)
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle
import os
import paddle.distributed.fleet.base.role_maker as role_maker
import time
class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
def setUp(self):
os.environ["PADDLE_PSERVER_NUMS"] = "2"
os.environ["PADDLE_TRAINERS_NUM"] = "2"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_PORT"] = "36001"
os.environ["PADDLE_TRAINER_ID"] = "0"
os.environ["PADDLE_TRAINERS_NUM"] = "2"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
"127.0.0.1:36001,127.0.0.2:36001"
def test_a_sync_optimizer2(self):
os.environ["TRAINING_ROLE"] = "TRAINER"
import paddle.distributed.fleet as fleet
main_program = paddle.fluid.Program()
startup_program = paddle.fluid.Program()
paddle.fluid.framework.switch_main_program(main_program)
paddle.fluid.framework.switch_startup_program(startup_program)
fleet.init(role_maker.PaddleCloudRoleMaker())
input_x = paddle.fluid.layers.data(
name="x", shape=[32], dtype='float32')
input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
cost = paddle.fluid.layers.cross_entropy(
input=prediction, label=input_y)
avg_cost = paddle.fluid.layers.mean(x=cost)
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.auto = True
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
self.assertTrue(optimizer.user_defined_strategy.a_sync)
a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
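        # With strategy.auto, fleet selects k_steps itself; for this dense
        # network it appears to resolve to GEO-style async training
        # (k_steps == 800), while the sparse-embedding variant in the
        # companion test file resolves to plain async (k_steps == 0).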
self.assertTrue(a_sync_configs['k_steps'] == 800)
if __name__ == "__main__":
unittest.main()
......@@ -113,8 +113,8 @@ class TranspilerAsyncLRDecayTest(unittest.TestCase):
["listen_and_serv"])
# block1: sum,cast,scale,floor,fill_constant,elementwise_pow,scale
self.assertEqual([op.type for op in pserver.blocks[1].ops], [
"sum", "cast", "fill_constant", "elementwise_div", "floor",
"fill_constant", "elementwise_pow", "scale"
"sum", "cast", "scale", "floor", "fill_constant", "elementwise_pow",
"scale"
])
# block1~2: optimize pass
......
......@@ -240,124 +240,25 @@ class TestElementwiseDivBroadcast(unittest.TestCase):
self.assertEqual((out_result == (2 / x)).all(), True)
class TestDivideAPI(unittest.TestCase):
def setUp(self):
paddle.set_default_dtype("float64")
self.places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
self.places.append(fluid.CUDAPlace(0))
def check_static_result(self, place):
        # rule 1: a numpy.ndarray operand raises TypeError
with fluid.program_guard(fluid.Program(), fluid.Program()):
x = fluid.data(name="x", shape=[3], dtype="float64")
y = np.array([1, 2, 3])
self.assertRaises(TypeError, paddle.divide, x=x, y=y)
        # rule 2: neither input is a Tensor
with fluid.program_guard(fluid.Program(), fluid.Program()):
x = 2
y = 4
res = paddle.divide(x, y)
exe = fluid.Executor(place)
np_z = exe.run(fluid.default_main_program(),
feed={},
fetch_list=[res])
self.assertEqual(np_z[0] == 0.5, True)
        # rule 3: mismatched dtypes raise TypeError
with fluid.program_guard(fluid.Program(), fluid.Program()):
x = fluid.data(name="x", shape=[3], dtype="float64")
y = fluid.data(name="y", shape=[3], dtype="float32")
self.assertRaises(TypeError, paddle.divide, x=x, y=y)
# rule 4: x is Tensor, y is scalar
with fluid.program_guard(fluid.Program(), fluid.Program()):
x = fluid.data(name="x", shape=[3], dtype="float64")
y = 2
exe = fluid.Executor(place)
res = x / y
np_z = exe.run(fluid.default_main_program(),
feed={"x": np.array([2, 3, 4]).astype('float64')},
fetch_list=[res])
z_expected = np.array([1., 1.5, 2.])
self.assertEqual((np_z[0] == z_expected).all(), True)
# rule 5: y is Tensor, x is scalar
with fluid.program_guard(fluid.Program(), fluid.Program()):
x = fluid.data(name="x", shape=[3], dtype="float64")
y = 2
exe = fluid.Executor(place)
res = y / x
np_z = exe.run(fluid.default_main_program(),
feed={"x": np.array([2, 8, 4]).astype('float64')},
fetch_list=[res])
z_expected = np.array([1., 0.25, 0.5])
self.assertEqual((np_z[0] == z_expected).all(), True)
# rule 6: y is Tensor, x is Tensor
with fluid.program_guard(fluid.Program(), fluid.Program()):
x = fluid.data(name="x", shape=[3], dtype="float64")
y = fluid.data(name="y", shape=[3], dtype="float64")
exe = fluid.Executor(place)
res = x / y
np_z = exe.run(fluid.default_main_program(),
feed={
"x": np.array([2, 3, 4]).astype('float64'),
"y": np.array([1, 5, 2]).astype('float64')
},
fetch_list=[res])
z_expected = np.array([2., 0.6, 2.])
self.assertEqual((np_z[0] == z_expected).all(), True)
    def test_static(self):
        for place in self.places:
            self.check_static_result(place=place)
class TestDivideOp(unittest.TestCase):
    def test_name(self):
        with fluid.program_guard(fluid.Program()):
            x = fluid.data(name="x", shape=[2, 3], dtype="float32")
            y = fluid.data(name='y', shape=[2, 3], dtype='float32')
            y_1 = paddle.divide(x, y, name='div_res')
            self.assertEqual(('div_res' in y_1.name), True)
def test_dygraph(self):
for place in self.places:
with fluid.dygraph.guard(place):
                # rule 1: a numpy.ndarray operand raises TypeError
np_x = np.array([2, 3, 4])
np_y = np.array([1, 5, 2])
x = paddle.to_tensor(np_x)
self.assertRaises(TypeError, paddle.divide, x=x, y=np_y)
                # rule 2: neither input is a Tensor
z = paddle.divide(3, 2)
self.assertEqual(z.numpy()[0] == 1.5, True)
                # rule 3: Tensors with mismatched dtypes raise TypeError
np_x = np.array([2, 3, 4])
np_y = np.array([1, 5, 2])
x = paddle.to_tensor(np_x, dtype="float32")
y = paddle.to_tensor(np_y, dtype="float64")
self.assertRaises(TypeError, paddle.divide, x=x, y=y)
# rule 4: x is Tensor, y is scalar
np_x = np.array([2, 3, 4])
x = paddle.to_tensor(np_x, dtype="int32")
y = 2
z = x / y
z_expected = np.array([1., 1.5, 2.])
self.assertEqual((z_expected == z.numpy()).all(), True)
# rule 5: y is Tensor, x is scalar
np_x = np.array([2, 1, 4])
x = paddle.to_tensor(np_x, dtype="int32")
y = 2
z = y / x
z_expected = np.array([1., 2., 0.5])
self.assertEqual((z_expected == z.numpy()).all(), True)
# rule 6: y is Tensor, x is Tensor
np_x = np.array([2, 3, 4])
np_y = np.array([1, 5, 2])
x = paddle.to_tensor(np_x)
y = paddle.to_tensor(np_y)
z = x / y
z_expected = np.array([2., 0.6, 2.])
self.assertEqual((z_expected == z.numpy()).all(), True)
with fluid.dygraph.guard():
np_x = np.array([2, 3, 4]).astype('float64')
np_y = np.array([1, 5, 2]).astype('float64')
x = paddle.to_tensor(np_x)
y = paddle.to_tensor(np_y)
z = paddle.divide(x, y)
np_z = z.numpy()
z_expected = np.array([2., 0.6, 2.])
self.assertEqual((np_z == z_expected).all(), True)
if __name__ == '__main__':
......
......@@ -58,13 +58,6 @@ class TestElementwiseModOp(OpTest):
pass
class TestElementwiseModOpInverse(TestElementwiseModOp):
def init_input_output(self):
self.x = np.random.uniform(0, 10000, [10]).astype(self.dtype)
self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype)
self.out = np.floor_divide(self.x, self.y)
class TestElementwiseModOp_scalar(TestElementwiseModOp):
def init_input_output(self):
scale_x = random.randint(0, 100000000)
......@@ -74,124 +67,25 @@ class TestElementwiseModOp_scalar(TestElementwiseModOp):
self.out = np.floor_divide(self.x, self.y)
class TestFloorDivideAPI(unittest.TestCase):
def setUp(self):
paddle.set_default_dtype("float64")
self.places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
self.places.append(fluid.CUDAPlace(0))
def check_static_result(self, place):
        # rule 1: a numpy.ndarray operand raises TypeError
with fluid.program_guard(fluid.Program(), fluid.Program()):
x = fluid.data(name="x", shape=[3], dtype="float64")
y = np.array([1, 2, 3])
self.assertRaises(TypeError, paddle.floor_divide, x=x, y=y)
        # rule 2: neither input is a Tensor
with fluid.program_guard(fluid.Program(), fluid.Program()):
x = 2
y = 4
res = paddle.floor_divide(x, y)
exe = fluid.Executor(place)
np_z = exe.run(fluid.default_main_program(),
feed={},
fetch_list=[res])
self.assertEqual(np_z[0] == 0., True)
        # rule 3: mismatched dtypes raise TypeError
with fluid.program_guard(fluid.Program(), fluid.Program()):
x = fluid.data(name="x", shape=[3], dtype="float64")
y = fluid.data(name="y", shape=[3], dtype="float32")
self.assertRaises(TypeError, paddle.floor_divide, x=x, y=y)
# rule 4: x is Tensor, y is scalar
with fluid.program_guard(fluid.Program(), fluid.Program()):
x = fluid.data(name="x", shape=[3], dtype="float64")
y = 2
exe = fluid.Executor(place)
res = x // y
np_z = exe.run(fluid.default_main_program(),
feed={"x": np.array([2, 3, 4]).astype('float64')},
fetch_list=[res])
z_expected = np.array([1., 1., 2.])
self.assertEqual((np_z[0] == z_expected).all(), True)
# rule 5: y is Tensor, x is scalar
with fluid.program_guard(fluid.Program(), fluid.Program()):
x = fluid.data(name="x", shape=[3], dtype="float64")
y = 2
exe = fluid.Executor(place)
res = y // x
np_z = exe.run(fluid.default_main_program(),
feed={"x": np.array([2, 8, 4]).astype('float64')},
fetch_list=[res])
z_expected = np.array([1., 0., 0.])
self.assertEqual((np_z[0] == z_expected).all(), True)
# rule 6: y is Tensor, x is Tensor
with fluid.program_guard(fluid.Program(), fluid.Program()):
x = fluid.data(name="x", shape=[3], dtype="float64")
y = fluid.data(name="y", shape=[3], dtype="float64")
exe = fluid.Executor(place)
res = x // y
np_z = exe.run(fluid.default_main_program(),
feed={
"x": np.array([2, 3, 4]).astype('float64'),
"y": np.array([1, 5, 2]).astype('float64')
},
fetch_list=[res])
z_expected = np.array([2., 0., 2.])
self.assertEqual((np_z[0] == z_expected).all(), True)
def test_static(self):
for place in self.places:
self.check_static_result(place=place)
class TestFloorDivideOp(unittest.TestCase):
def test_name(self):
with fluid.program_guard(fluid.Program()):
x = fluid.data(name="x", shape=[2, 3], dtype="int64")
y = fluid.data(name='y', shape=[2, 3], dtype='int64')
y_1 = paddle.floor_divide(x, y, name='div_res')
self.assertEqual(('div_res' in y_1.name), True)
def test_dygraph(self):
for place in self.places:
with fluid.dygraph.guard(place):
                # rule 1: a numpy.ndarray operand raises TypeError
np_x = np.array([2, 3, 4])
np_y = np.array([1, 5, 2])
x = paddle.to_tensor(np_x)
self.assertRaises(TypeError, paddle.floor_divide, x=x, y=np_y)
                # rule 2: neither input is a Tensor
z = paddle.floor_divide(3, 2)
self.assertEqual(z.numpy()[0] == 1., True)
                # rule 3: Tensors with mismatched dtypes raise TypeError
np_x = np.array([2, 3, 4])
np_y = np.array([1, 5, 2])
x = paddle.to_tensor(np_x, dtype="float32")
y = paddle.to_tensor(np_y, dtype="float64")
self.assertRaises(TypeError, paddle.floor_divide, x=x, y=y)
# rule 4: x is Tensor, y is scalar
np_x = np.array([2, 3, 4])
x = paddle.to_tensor(np_x, dtype="int32")
y = 2
z = x // y
z_expected = np.array([1, 1, 2])
self.assertEqual((z_expected == z.numpy()).all(), True)
# rule 5: y is Tensor, x is scalar
np_x = np.array([2, 1, 4])
x = paddle.to_tensor(np_x, dtype="int32")
y = 2
z = y // x
z_expected = np.array([1, 2, 0])
self.assertEqual((z_expected == z.numpy()).all(), True)
# rule 6: y is Tensor, x is Tensor
np_x = np.array([2, 3, 4])
np_y = np.array([1, 5, 2])
x = paddle.to_tensor(np_x)
y = paddle.to_tensor(np_y)
z = x // y
z_expected = np.array([2., 0., 2.])
self.assertEqual((z_expected == z.numpy()).all(), True)
with fluid.dygraph.guard():
np_x = np.array([2, 3, 8, 7]).astype('int64')
np_y = np.array([1, 5, 3, 3]).astype('int64')
x = paddle.to_tensor(np_x)
y = paddle.to_tensor(np_y)
z = paddle.floor_divide(x, y)
np_z = z.numpy()
z_expected = np.array([2, 0, 2, 2])
self.assertEqual((np_z == z_expected).all(), True)
with fluid.dygraph.guard(fluid.CPUPlace()):
# divide by zero
......
......@@ -84,149 +84,41 @@ class TestElementwiseModOpDouble(TestElementwiseModOpFloat):
self.dtype = np.float64
class TestRemainderAPI(unittest.TestCase):
def setUp(self):
paddle.set_default_dtype("float64")
self.places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
self.places.append(fluid.CUDAPlace(0))
def check_static_result(self, place):
        # rule 1: a numpy.ndarray operand raises TypeError
with fluid.program_guard(fluid.Program(), fluid.Program()):
x = fluid.data(name="x", shape=[3], dtype="float64")
y = np.array([1, 2, 3])
self.assertRaises(TypeError, paddle.remainder, x=x, y=y)
        # rule 3: mismatched dtypes raise TypeError
with fluid.program_guard(fluid.Program(), fluid.Program()):
x = fluid.data(name="x", shape=[3], dtype="float64")
y = fluid.data(name="y", shape=[3], dtype="float32")
self.assertRaises(TypeError, paddle.remainder, x=x, y=y)
# rule 4: x is Tensor, y is scalar
with fluid.program_guard(fluid.Program(), fluid.Program()):
x = fluid.data(name="x", shape=[3], dtype="float64")
y = 2
exe = fluid.Executor(place)
res = x % y
np_z = exe.run(fluid.default_main_program(),
feed={"x": np.array([2, 3, 4]).astype('float64')},
fetch_list=[res])
z_expected = np.array([0., 1., 0.])
self.assertEqual((np_z[0] == z_expected).all(), True)
# rule 5: y is Tensor, x is scalar
with fluid.program_guard(fluid.Program(), fluid.Program()):
x = 3
y = fluid.data(name="y", shape=[3], dtype="float32")
self.assertRaises(TypeError, paddle.remainder, x=x, y=y)
# rule 6: y is Tensor, x is Tensor
with fluid.program_guard(fluid.Program(), fluid.Program()):
x = fluid.data(name="x", shape=[3], dtype="float64")
y = fluid.data(name="y", shape=[1], dtype="float64")
exe = fluid.Executor(place)
res = x % y
np_z = exe.run(fluid.default_main_program(),
feed={
"x": np.array([1., 2., 4]).astype('float64'),
"y": np.array([1.5]).astype('float64')
},
fetch_list=[res])
z_expected = np.array([1., 0.5, 1.0])
self.assertEqual((np_z[0] == z_expected).all(), True)
        # rule 6 again: broadcast with negative x values
with fluid.program_guard(fluid.Program(), fluid.Program()):
x = fluid.data(name="x", shape=[6], dtype="float64")
y = fluid.data(name="y", shape=[1], dtype="float64")
exe = fluid.Executor(place)
res = x % y
np_z = exe.run(
fluid.default_main_program(),
feed={
"x": np.array([-3., -2, -1, 1, 2, 3]).astype('float64'),
"y": np.array([2]).astype('float64')
},
fetch_list=[res])
z_expected = np.array([1., 0., 1., 1., 0., 1.])
self.assertEqual((np_z[0] == z_expected).all(), True)
def test_static(self):
for place in self.places:
self.check_static_result(place=place)
class TestRemainderOp(unittest.TestCase):
def test_name(self):
with fluid.program_guard(fluid.Program()):
x = fluid.data(name="x", shape=[2, 3], dtype="int64")
y = fluid.data(name='y', shape=[2, 3], dtype='int64')
y_1 = paddle.remainder(x, y, name='div_res')
self.assertEqual(('div_res' in y_1.name), True)
def test_dygraph(self):
for place in self.places:
with fluid.dygraph.guard(place):
                # rule 1: a numpy.ndarray operand raises TypeError
np_x = np.array([2, 3, 4])
np_y = np.array([1, 5, 2])
x = paddle.to_tensor(np_x)
self.assertRaises(TypeError, paddle.remainder, x=x, y=np_y)
                # rule 3: Tensors with mismatched dtypes raise TypeError
np_x = np.array([2, 3, 4])
np_y = np.array([1, 5, 2])
x = paddle.to_tensor(np_x, dtype="float32")
y = paddle.to_tensor(np_y, dtype="float64")
self.assertRaises(TypeError, paddle.remainder, x=x, y=y)
# rule 4: x is Tensor, y is scalar
np_x = np.array([2, 3, 4])
x = paddle.to_tensor(np_x, dtype="int32")
y = 2
z = x % y
z_expected = np.array([0, 1, 0])
self.assertEqual((z_expected == z.numpy()).all(), True)
# rule 5: y is Tensor, x is scalar
np_x = np.array([2, 3, 4])
x = paddle.to_tensor(np_x)
self.assertRaises(TypeError, paddle.remainder, x=3, y=x)
# rule 6: y is Tensor, x is Tensor
np_x = np.array([1., 2., 4])
np_y = np.array([1.5])
x = paddle.to_tensor(np_x)
y = paddle.to_tensor(np_y)
z = x % y
z_expected = np.array([1., 0.5, 1.0])
self.assertEqual((z_expected == z.numpy()).all(), True)
                # rule 6 again: broadcast with negative operands
np_x = np.array([-3., -2, -1, 1, 2, 3])
np_y = np.array([2.])
x = paddle.to_tensor(np_x)
y = paddle.to_tensor(np_y)
z = x % y
z_expected = np.array([1., 0., 1., 1., 0., 1.])
self.assertEqual((z_expected == z.numpy()).all(), True)
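                # Floored modulo: z = x - floor(x / y) * y, so the result
                # takes the sign of y (matching np.mod), as the
                # negative-operand cases below verify.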
np_x = np.array([-3.3, 11.5, -2, 3.5])
np_y = np.array([-1.2, 2., 3.3, -2.3])
x = paddle.to_tensor(np_x)
y = paddle.to_tensor(np_y)
z = x % y
z_expected = np.array([-0.9, 1.5, 1.3, -1.1])
self.assertEqual(np.allclose(z_expected, z.numpy()), True)
np_x = np.array([-3, 11, -2, 3])
np_y = np.array([-1, 2, 3, -2])
x = paddle.to_tensor(np_x, dtype="int64")
y = paddle.to_tensor(np_y, dtype="int64")
z = x % y
z_expected = np.array([0, 1, 1, -1])
self.assertEqual(np.allclose(z_expected, z.numpy()), True)
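                # Broadcasting: x of shape [2] pairs with each row of y
                # (shape [2, 2]); the sign again follows the divisor.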
np_x = np.array([-3, 3])
np_y = np.array([[2, 3], [-2, -1]])
x = paddle.to_tensor(np_x, dtype="int64")
y = paddle.to_tensor(np_y, dtype="int64")
z = x % y
z_expected = np.array([[1, 0], [-1, 0]])
self.assertEqual(np.allclose(z_expected, z.numpy()), True)
with fluid.dygraph.guard():
np_x = np.array([2, 3, 8, 7]).astype('int64')
np_y = np.array([1, 5, 3, 3]).astype('int64')
x = paddle.to_tensor(np_x)
y = paddle.to_tensor(np_y)
z = paddle.remainder(x, y)
np_z = z.numpy()
z_expected = np.array([0, 3, 2, 1])
self.assertEqual((np_z == z_expected).all(), True)
np_x = np.array([-3.3, 11.5, -2, 3.5])
np_y = np.array([-1.2, 2., 3.3, -2.3])
x = paddle.to_tensor(np_x)
y = paddle.to_tensor(np_y)
z = x % y
z_expected = np.array([-0.9, 1.5, 1.3, -1.1])
self.assertEqual(np.allclose(z_expected, z.numpy()), True)
np_x = np.array([-3, 11, -2, 3])
np_y = np.array([-1, 2, 3, -2])
x = paddle.to_tensor(np_x, dtype="int64")
y = paddle.to_tensor(np_y, dtype="int64")
z = x % y
z_expected = np.array([0, 1, 1, -1])
self.assertEqual(np.allclose(z_expected, z.numpy()), True)
if __name__ == '__main__':
......
#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
from op_test import OpTest
from paddle.fluid import Program, program_guard
from paddle.fluid.framework import convert_np_dtype_to_dtype_
# Situation 1: Attr(shape) is a list (without tensor)
class TestEmptyOp(OpTest):
def setUp(self):
self.op_type = "empty"
self.init_config()
def test_check_output(self):
self.check_output_customized(self.verify_output)
def verify_output(self, outs):
data_type = outs[0].dtype
if data_type in ['float32', 'float64', 'int32', 'int64']:
max_value = np.nanmax(outs[0])
min_value = np.nanmin(outs[0])
always_full_zero = max_value == 0.0 and min_value == 0.0
always_non_full_zero = max_value > min_value
self.assertTrue(always_full_zero or always_non_full_zero,
'always_full_zero or always_non_full_zero.')
elif data_type in ['bool']:
total_num = outs[0].size
true_num = np.sum(outs[0] == True)
false_num = np.sum(outs[0] == False)
self.assertTrue(total_num == true_num + false_num,
'The value should always be True or False.')
else:
self.assertTrue(False, 'invalid data type')
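    # Note on verify_output: `empty` returns uninitialized memory, so exact
    # values are undefined. The check accepts either an all-zero buffer or one
    # containing at least two distinct values; for bool outputs it only checks
    # that every element is True or False.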
def init_config(self):
shape = [500, 3]
dtype = 'float32'
dtype_inner = convert_np_dtype_to_dtype_(dtype)
self.attrs = {'shape': shape, 'dtype': dtype_inner}
self.inputs = {}
self.outputs = {'Out': np.zeros(shape).astype(dtype)}
class TestEmptyOp2(TestEmptyOp):
def init_config(self):
shape = [500, 3]
dtype = 'float64'
dtype_inner = convert_np_dtype_to_dtype_(dtype)
self.attrs = {'shape': shape, 'dtype': dtype_inner}
self.inputs = {}
self.outputs = {'Out': np.zeros(shape).astype(dtype)}
class TestEmptyOp3(TestEmptyOp):
def init_config(self):
shape = [500, 3]
dtype = 'int32'
dtype_inner = convert_np_dtype_to_dtype_(dtype)
self.attrs = {'shape': shape, 'dtype': dtype_inner}
self.inputs = {}
self.outputs = {'Out': np.zeros(shape).astype(dtype)}
class TestEmptyOp4(TestEmptyOp):
def init_config(self):
shape = [500, 3]
dtype = 'int64'
dtype_inner = convert_np_dtype_to_dtype_(dtype)
self.attrs = {'shape': shape, 'dtype': dtype_inner}
self.inputs = {}
self.outputs = {'Out': np.zeros(shape).astype(dtype)}
class TestEmptyOp5(TestEmptyOp):
def init_config(self):
shape = [500, 3]
dtype = 'bool'
dtype_inner = convert_np_dtype_to_dtype_(dtype)
self.attrs = {'shape': shape, 'dtype': dtype_inner}
self.inputs = {}
self.outputs = {'Out': np.zeros(shape).astype(dtype)}
# Situation 2: shape is a tensor
class TestEmptyOp_ShapeTensor(OpTest):
def setUp(self):
self.op_type = "empty"
self.init_config()
def init_config(self):
self.shape = [500, 3]
dtype = 'float32'
dtype_inner = convert_np_dtype_to_dtype_(dtype)
self.attrs = {'dtype': dtype_inner}
self.inputs = {"ShapeTensor": np.array(self.shape).astype("int32")}
self.outputs = {'Out': np.zeros(self.shape).astype(dtype)}
def test_check_output(self):
self.check_output_customized(self.verify_output)
def verify_output(self, outs):
data_type = outs[0].dtype
if data_type in ['float32', 'float64', 'int32', 'int64']:
max_value = np.nanmax(outs[0])
min_value = np.nanmin(outs[0])
always_full_zero = max_value == 0.0 and min_value == 0.0
always_non_full_zero = max_value > min_value
self.assertTrue(always_full_zero or always_non_full_zero,
'always_full_zero or always_non_full_zero.')
elif data_type in ['bool']:
total_num = outs[0].size
true_num = np.sum(outs[0] == True)
false_num = np.sum(outs[0] == False)
self.assertTrue(total_num == true_num + false_num,
'The value should always be True or False.')
else:
self.assertTrue(False, 'invalid data type')
# Situation 3: Attr(shape) is a list (with tensor)
class TestEmptyOp_ShapeTensorList(OpTest):
def setUp(self):
self.op_type = "empty"
self.init_config()
def init_config(self):
self.shape = [123, 92]
self.infer_shape = [-1, 92]
dtype = 'float32'
dtype_inner = convert_np_dtype_to_dtype_(dtype)
shape_tensor_list = []
for index, ele in enumerate(self.shape):
shape_tensor_list.append(("x" + str(index), np.ones(
(1)).astype('int32') * ele))
self.inputs = {"ShapeTensorList": shape_tensor_list}
self.attrs = {'shape': self.infer_shape, 'dtype': dtype_inner}
self.outputs = {'Out': np.zeros(self.shape).astype(dtype)}
def test_check_output(self):
self.check_output_customized(self.verify_output)
def verify_output(self, outs):
data_type = outs[0].dtype
if data_type in ['float32', 'float64', 'int32', 'int64']:
max_value = np.nanmax(outs[0])
min_value = np.nanmin(outs[0])
always_full_zero = max_value == 0.0 and min_value == 0.0
always_non_full_zero = max_value > min_value
self.assertTrue(always_full_zero or always_non_full_zero,
'always_full_zero or always_non_full_zero.')
elif data_type in ['bool']:
total_num = outs[0].size
true_num = np.sum(outs[0] == True)
false_num = np.sum(outs[0] == False)
self.assertTrue(total_num == true_num + false_num,
'The value should always be True or False.')
else:
self.assertTrue(False, 'invalid data type')
class TestEmptyAPI(unittest.TestCase):
def __check_out__(self, out, dtype='float32'):
max_value = np.nanmax(np.array(out))
min_value = np.nanmin(np.array(out))
always_non_full_zero = max_value > min_value
always_full_zero = max_value == 0.0 and min_value == 0.0
self.assertTrue(always_full_zero or always_non_full_zero,
'always_full_zero or always_non_full_zero.')
def test_dygraph_api_out(self):
paddle.disable_static()
shape = [200, 3]
out = paddle.empty(shape=shape)
self.__check_out__(out)
paddle.enable_static()
def test_dygraph_api_out_2(self):
paddle.disable_static()
shape_data = np.array([200, 3]).astype('int32')
shape = paddle.to_tensor(shape_data)
out = paddle.empty(shape=shape)
self.__check_out__(out)
paddle.enable_static()
def test_dygraph_api_out_3(self):
paddle.disable_static()
shape_data = np.array([200, 3]).astype('int64')
shape = paddle.to_tensor(shape_data)
out = paddle.empty(shape=shape)
self.__check_out__(out)
paddle.enable_static()
def test_dygraph_api_attr(self):
paddle.disable_static()
shape = [200, 3]
dtype = 'float64'
out = paddle.empty(shape=shape, dtype=dtype)
self.__check_out__(out, dtype)
paddle.enable_static()
def test_static_graph(self):
dtype = 'float64'
positive_2_int32 = fluid.layers.fill_constant([1], "int32", 3)
positive_2_int64 = fluid.layers.fill_constant([1], "int64", 3)
shape_tensor_int32 = fluid.data(
name="shape_tensor_int32", shape=[2], dtype="int32")
shape_tensor_int64 = fluid.data(
name="shape_tensor_int64", shape=[2], dtype="int64")
out_1 = paddle.empty(shape=[200, 3], dtype=dtype)
out_2 = paddle.empty(shape=shape_tensor_int32, dtype=dtype)
out_3 = paddle.empty(shape=shape_tensor_int64, dtype=dtype)
out_4 = paddle.empty(shape=[200, positive_2_int32], dtype=dtype)
out_5 = paddle.empty(shape=[200, positive_2_int64], dtype=dtype)
place = paddle.CPUPlace()
exe = paddle.static.Executor(place)
res_1, res_2, res_3, res_4, res_5 = exe.run(
fluid.default_main_program(),
feed={
"shape_tensor_int32": np.array([200, 3]).astype("int32"),
"shape_tensor_int64": np.array([200, 3]).astype("int64"),
},
fetch_list=[out_1, out_2, out_3, out_4, out_5])
self.__check_out__(res_1, dtype)
self.__check_out__(res_2, dtype)
self.__check_out__(res_3, dtype)
self.__check_out__(res_4, dtype)
self.__check_out__(res_5, dtype)
class TestEmptyError(unittest.TestCase):
def test_attr(self):
def test_dtype():
shape = [200, 3]
dtype = 'uint8'
result = paddle.empty(shape=shape, dtype=dtype)
self.assertRaises(TypeError, test_dtype)
if __name__ == '__main__':
unittest.main()
......@@ -28,7 +28,7 @@ class TestFetchUnmerged(unittest.TestCase):
conv_pool_1 = fluid.nets.simple_img_conv_pool(
input=img,
filter_size=5,
num_filters=20,
num_filters=8,
pool_size=2,
pool_stride=2,
pool_type='max',
......@@ -37,12 +37,12 @@ class TestFetchUnmerged(unittest.TestCase):
conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=conv_pool_1,
filter_size=5,
num_filters=50,
num_filters=16,
pool_size=2,
pool_stride=2,
pool_type='avg',
act="relu")
hidden = fluid.layers.fc(input=conv_pool_2, size=100, act='relu')
hidden = fluid.layers.fc(input=conv_pool_2, size=32, act='relu')
prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
avg_loss = fluid.layers.mean(loss)
......@@ -75,8 +75,8 @@ class TestFetchUnmerged(unittest.TestCase):
binary = fluid.CompiledProgram(main_program).with_data_parallel(
loss_name=loss.name, build_strategy=build_strategy)
iters = 3
batch_size = 64
iters = 2
batch_size = 16
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.mnist.train(), buf_size=500),
......
......@@ -57,7 +57,7 @@ class TestFleetAMPOptimizer(unittest.TestCase):
ops = [op.type for op in avg_cost.block.ops]
self.assertIn('cast', ops)
self.assertIn('isfinite', ops)
self.assertIn('check_finite_and_unscale', ops)
if __name__ == "__main__":
......
......@@ -81,9 +81,10 @@ class TestStrategyConfig(unittest.TestCase):
def test_localsgd_configs(self):
strategy = paddle.distributed.fleet.DistributedStrategy()
configs = {"k_steps": 4}
configs = {"k_steps": 4, "begin_step": 120}
strategy.localsgd_configs = configs
self.assertEqual(strategy.localsgd_configs["k_steps"], 4)
self.assertEqual(strategy.localsgd_configs["begin_step"], 120)
def test_dgc(self):
strategy = paddle.distributed.fleet.DistributedStrategy()
......@@ -230,7 +231,7 @@ class TestStrategyConfig(unittest.TestCase):
strategy.a_sync = True
strategy.localsgd = True
strategy.dgc = True
localsgd_configs = {"k_steps": 5}
localsgd_configs = {"k_steps": 5, "begin_step": 1}
strategy.localsgd_configs = localsgd_configs
build_strategy = paddle.fluid.BuildStrategy()
build_strategy.enable_sequential_execution = True
......@@ -316,6 +317,14 @@ class TestStrategyConfig(unittest.TestCase):
self.assertEqual(strategy.conv_workspace_size_limit, 1000)
strategy._enable_env()
def test_distributed_strategy_repr(self):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.recompute = True
strategy.recompute_configs = {"checkpoints": ["a1", "a2", "a3"]}
strategy.amp = True
strategy.localsgd = True
print(str(strategy))
if __name__ == '__main__':
unittest.main()
......@@ -22,11 +22,9 @@ import paddle.distributed.fleet.base.role_maker as role_maker
class TestFleetLambMetaOptimizer(unittest.TestCase):
def setUp(self):
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
os.environ["PADDLE_TRAINERS_NUM"] = "2"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
"127.0.0.1:36001,127.0.0.2:36001"
os.environ["PADDLE_TRAINER_ID"] = "1"
os.environ[
"PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002"
def net(self, main_prog, startup_prog):
with fluid.program_guard(main_prog, startup_prog):
......@@ -97,13 +95,54 @@ class TestFleetLambMetaOptimizer(unittest.TestCase):
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
ops_with_bias = [
ops_without_wd = [
op for op in avg_cost.block.ops
if op.type == 'lamb' and op.attr('op_role_var')[0].endswith('.b_0')
]
for op in ops_with_bias:
for op in ops_without_wd:
self.assertEqual(op.attr('weight_decay'), 0)
def test_lamb_apply_with_amp(self):
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
input_x = paddle.fluid.layers.data(
name="x", shape=[32], dtype='float32')
input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
cost = paddle.fluid.layers.cross_entropy(
input=prediction, label=input_y)
avg_cost = paddle.fluid.layers.mean(x=cost)
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.amp = True
strategy.amp_configs = {
"init_loss_scaling": 32768,
"decr_every_n_nan_or_inf": 2,
"incr_every_n_steps": 1000,
"incr_ratio": 2.0,
"use_dynamic_loss_scaling": True,
"decr_ratio": 0.5,
"custom_white_list": ['softmax'],
"custom_black_list": ['tanh'],
}
strategy.lamb = True
strategy.lamb_configs = {
'lamb_weight_decay': 0.01,
'exclude_from_weight_decay': [],
}
optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
ops = [op.type for op in avg_cost.block.ops]
self.assertIn('lamb', ops)
self.assertIn('cast', ops)
self.assertIn('isfinite', ops)
if __name__ == "__main__":
unittest.main()
......@@ -22,11 +22,9 @@ import paddle.distributed.fleet.base.role_maker as role_maker
class TestFleetLarsMetaOptimizer(unittest.TestCase):
def setUp(self):
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
os.environ["PADDLE_TRAINERS_NUM"] = "2"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
"127.0.0.1:36001,127.0.0.2:36001"
os.environ["PADDLE_TRAINER_ID"] = "1"
os.environ[
"PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002"
def net(self, main_prog, startup_prog):
with fluid.program_guard(main_prog, startup_prog):
......@@ -52,6 +50,8 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase):
strategy.lars_configs = {
"lars_coeff": 0.001,
"lars_weight_decay": 0.0005,
"epsilon": 0,
"exclude_from_weight_decay": ["batch_norm", ".b"],
}
return avg_cost, strategy
......@@ -83,6 +83,70 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase):
ops = [op.type for op in avg_cost.block.ops]
self.assertNotIn('lars_momentum', ops)
def test_lars_exclude_fn(self):
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
startup_prog = fluid.Program()
train_prog = fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
optimizer = paddle.fluid.optimizer.Momentum(
learning_rate=0.01, momentum=0.9)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
ops_without_wd = [
op for op in avg_cost.block.ops
if op.type == 'lars_momentum' and ("batch_norm" in op.attr(
'op_role_var')[0] or ".b" in op.attr('op_role_var')[0])
]
for op in ops_without_wd:
self.assertEqual(op.attr('lars_weight_decay'), 0)
def test_lars_apply_with_amp(self):
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
input_x = paddle.fluid.layers.data(
name="x", shape=[32], dtype='float32')
input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
cost = paddle.fluid.layers.cross_entropy(
input=prediction, label=input_y)
avg_cost = paddle.fluid.layers.mean(x=cost)
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.amp = True
strategy.amp_configs = {
"init_loss_scaling": 32768,
"decr_every_n_nan_or_inf": 2,
"incr_every_n_steps": 1000,
"incr_ratio": 2.0,
"use_dynamic_loss_scaling": True,
"decr_ratio": 0.5,
"custom_white_list": ['softmax'],
"custom_black_list": ['tanh'],
}
strategy.lars = True
strategy.lars_configs = {
"lars_coeff": 0.001,
"lars_weight_decay": 0.0005,
"epsilon": 0,
"exclude_from_weight_decay": ["batch_norm", ".b"],
}
optimizer = paddle.fluid.optimizer.Momentum(
learning_rate=0.01, momentum=0.9)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
ops = [op.type for op in avg_cost.block.ops]
self.assertIn('lars_momentum', ops)
self.assertIn('cast', ops)
self.assertIn('isfinite', ops)
if __name__ == "__main__":
unittest.main()
......@@ -44,6 +44,7 @@ class TestFleetLocalSGDMetaOptimizer(unittest.TestCase):
strategy.auto = True
config = strategy.localsgd_configs
config['k_steps'] = 1
config['begin_step'] = 1
strategy.localsgd_configs = config
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
......
......@@ -25,7 +25,7 @@ class TestFuseBatchNormActPass(unittest.TestCase):
hidden1 = fluid.layers.conv2d(
input=x,
filter_size=3,
num_filters=32,
num_filters=16,
stride=1,
padding=1,
act=None,
......@@ -43,7 +43,7 @@ class TestFuseBatchNormActPass(unittest.TestCase):
bias_attr=bias_attr,
act='relu',
data_layout='NHWC')
hidden3 = fluid.layers.fc(input=hidden2, size=128, act='relu')
hidden3 = fluid.layers.fc(input=hidden2, size=32, act='relu')
hidden4 = fluid.layers.batch_norm(
input=hidden3, act='relu', data_layout='NHWC')
prediction = fluid.layers.fc(input=hidden4, size=10, act='softmax')
......@@ -63,7 +63,7 @@ class TestFuseBatchNormActPass(unittest.TestCase):
startup_program = fluid.Program()
x, y, loss = self.build_program(main_program, startup_program, use_cuda)
exe = fluid.Executor(place)
iters = 10
iters = 8
batch_size = 16
feeder = fluid.DataFeeder(feed_list=[x, y], place=place)
......
......@@ -18,6 +18,7 @@ import multiprocessing
import numpy as np
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.reader import _reader_process_loop
if sys.version_info[0] == 2:
import Queue as queue
......@@ -66,7 +67,7 @@ class TestDygraphDataLoaderProcess(unittest.TestCase):
batch_generator_creator(self.batch_size, self.batch_num),
places=fluid.CPUPlace())
loader._data_queue = queue.Queue(self.batch_num + 1)
loader._reader_process_loop()
_reader_process_loop(loader._batch_reader, loader._data_queue)
# For clean memory mapped files
util_queue = multiprocessing.Queue(self.batch_num + 1)
for _ in range(self.batch_num):
......@@ -94,7 +95,7 @@ class TestDygraphDataLoaderProcess(unittest.TestCase):
loader._data_queue = queue.Queue(self.batch_num + 1)
exception = None
try:
loader._reader_process_loop()
_reader_process_loop(loader._batch_reader, loader._data_queue)
except core.EnforceNotMet as ex:
exception = ex
self.assertIsNotNone(exception)
......
......@@ -15,18 +15,26 @@
import unittest
import numpy as np
import paddle.fluid as fluid
import warnings
class TestImperativeNumpyBridge(unittest.TestCase):
def test_tensor_from_numpy(self):
data_np = np.array([[2, 3, 1]]).astype('float32')
with fluid.dygraph.guard(fluid.CPUPlace()):
var = fluid.dygraph.to_variable(data_np, zero_copy=True)
self.assertTrue(np.array_equal(var.numpy(), data_np))
data_np[0][0] = 4
self.assertEqual(data_np[0][0], 4)
self.assertEqual(var[0][0].numpy()[0], 4)
self.assertTrue(np.array_equal(var.numpy(), data_np))
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
var = fluid.dygraph.to_variable(data_np, zero_copy=True)
assert "Currently, zero_copy is not supported, and it will be discarded." in str(
w[-1].message)
            # Temporarily disable zero_copy
# var = fluid.dygraph.to_variable(data_np, zero_copy=True)
# self.assertTrue(np.array_equal(var.numpy(), data_np))
# data_np[0][0] = 4
# self.assertEqual(data_np[0][0], 4)
# self.assertEqual(var[0][0].numpy()[0], 4)
# self.assertTrue(np.array_equal(var.numpy(), data_np))
var2 = fluid.dygraph.to_variable(data_np, zero_copy=False)
self.assertTrue(np.array_equal(var2.numpy(), data_np))
data_np[0][0] = -1
......
......@@ -23,7 +23,7 @@ from paddle.static import InputSpec
import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear
from paddle.fluid.dygraph import declarative, ProgramTranslator
from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME
from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME, VARIABLE_FILENAME
BATCH_SIZE = 32
BATCH_NUM = 10
......@@ -56,6 +56,16 @@ class LinearNet(fluid.dygraph.Layer):
return self._linear(x)
class LinearNetWithInputSpec(fluid.dygraph.Layer):
def __init__(self, in_size, out_size):
super(LinearNetWithInputSpec, self).__init__()
self._linear = Linear(in_size, out_size)
@declarative(input_spec=[InputSpec(shape=[None, 784], dtype='float32')])
def forward(self, x):
return self._linear(x)
class LinearNetNotDeclarative(fluid.dygraph.Layer):
def __init__(self, in_size, out_size):
super(LinearNetNotDeclarative, self).__init__()
......@@ -65,6 +75,23 @@ class LinearNetNotDeclarative(fluid.dygraph.Layer):
return self._linear(x)
class LinerNetWithLabel(paddle.nn.Layer):
def __init__(self, in_size, out_size):
super(LinerNetWithLabel, self).__init__()
self._linear = Linear(in_size, out_size)
@declarative(input_spec=[
InputSpec(
shape=[None, 784], dtype='float32', name="image"), InputSpec(
shape=[None, 1], dtype='int64', name="label")
])
def forward(self, x, label):
out = self._linear(x)
loss = fluid.layers.cross_entropy(out, label)
avg_loss = fluid.layers.mean(loss)
return out, avg_loss
class LinearNetReturnLoss(fluid.dygraph.Layer):
def __init__(self, in_size, out_size):
super(LinearNetReturnLoss, self).__init__()
......@@ -78,6 +105,72 @@ class LinearNetReturnLoss(fluid.dygraph.Layer):
return z, loss
class LinearNetMultiInput(fluid.dygraph.Layer):
def __init__(self, in_size, out_size):
super(LinearNetMultiInput, self).__init__()
self._linear1 = Linear(in_size, out_size)
self._linear2 = Linear(in_size, out_size)
@declarative(input_spec=[
InputSpec(
[None, 8], dtype='float32'), InputSpec(
[None, 8], dtype='float32')
])
def forward(self, x, y):
x_out = self._linear1(x)
y_out = self._linear2(y)
loss = fluid.layers.mean(x_out + y_out)
return x_out, y_out, loss
class MultiLoadingLinearNet(fluid.dygraph.Layer):
def __init__(self, size, model_path):
super(MultiLoadingLinearNet, self).__init__()
self._linear = Linear(size, size)
self._load_linear1 = fluid.dygraph.jit.load(model_path)
self._load_linear2 = fluid.dygraph.jit.load(model_path)
@declarative
def forward(self, x):
tmp1 = self._linear(x)
tmp2 = self._load_linear1(tmp1)
tmp3 = self._load_linear2(tmp2)
y = self._linear(tmp3)
return y
class LinearNetReturnHidden(fluid.dygraph.Layer):
def __init__(self, in_size, out_size):
super(LinearNetReturnHidden, self).__init__()
self._linear_1 = Linear(in_size, out_size)
self._linear_2 = Linear(in_size, out_size)
@declarative
def forward(self, x):
y = self._linear_1(x)
z = self._linear_2(y)
loss = fluid.layers.mean(z)
return y, loss
class EmptyLayer(paddle.nn.Layer):
def __init__(self):
super(EmptyLayer, self).__init__()
@paddle.jit.to_static
def forward(self, x):
return x
class NoParamLayer(paddle.nn.Layer):
def __init__(self):
super(NoParamLayer, self).__init__()
@paddle.jit.to_static
def forward(self, x, y):
return x + y
def train(layer, input_size=784, label_size=1):
# create optimizer
sgd = fluid.optimizer.SGDOptimizer(
......@@ -102,6 +195,27 @@ def train(layer, input_size=784, label_size=1):
return [img], layer, avg_loss
def train_with_label(layer, input_size=784, label_size=1):
# create optimizer
sgd = fluid.optimizer.SGDOptimizer(
learning_rate=0.01, parameter_list=layer.parameters())
# create data loader
train_loader = fluid.io.DataLoader.from_generator(capacity=5)
train_loader.set_batch_generator(
random_batch_reader(input_size, label_size))
# train
for data in train_loader():
img, label = data
label.stop_gradient = True
out, avg_loss = layer(img, label)
avg_loss.backward()
sgd.minimize(avg_loss)
layer.clear_gradients()
return out
class TestJitSaveLoad(unittest.TestCase):
def setUp(self):
self.model_path = "model.test_jit_save_load"
......@@ -159,8 +273,11 @@ class TestJitSaveLoad(unittest.TestCase):
train_layer.eval()
# construct new model
new_layer = LinearNet(784, 1)
model_dict, _ = fluid.dygraph.load_dygraph(self.model_path)
new_layer.set_dict(model_dict)
orig_state_dict = new_layer.state_dict()
load_state_dict, _ = fluid.dygraph.load_dygraph(self.model_path)
for structured_name in orig_state_dict:
self.assertTrue(structured_name in load_state_dict)
new_layer.set_state_dict(load_state_dict)
new_layer.eval()
# inference & compare
x = fluid.dygraph.to_variable(
......@@ -168,38 +285,20 @@ class TestJitSaveLoad(unittest.TestCase):
self.assertTrue(
np.array_equal(train_layer(x).numpy(), new_layer(x).numpy()))
def test_save_get_program_failed(self):
layer = LinearNetNotDeclarative(784, 1)
example_inputs, layer, _ = train(layer)
with self.assertRaises(RuntimeError):
fluid.dygraph.jit.save(
layer=layer,
model_path=self.model_path,
input_spec=example_inputs)
def test_load_dygraph_no_path(self):
model_path = "model.test_jit_save_load.no_path"
new_layer = LinearNet(784, 1)
with self.assertRaises(ValueError):
model_dict, _ = fluid.dygraph.load_dygraph(model_path)
class LinearNetMultiInput(fluid.dygraph.Layer):
def __init__(self, in_size, out_size):
super(LinearNetMultiInput, self).__init__()
self._linear1 = Linear(in_size, out_size)
# self._linear2 = Linear(in_size, out_size)
@declarative(input_spec=[
InputSpec(
[None, 8], dtype='float32'), InputSpec(
[None, 8], dtype='float32')
])
def forward(self, x, y):
x_out = self._linear1(x)
y_out = self._linear1(y)
loss = fluid.layers.mean(x_out + y_out)
return x_out, y_out, loss
def test_jit_load_model_incomplete(self):
model_path = "model.test_jit_save_load.remove_variables"
self.train_and_save_model(model_path=model_path)
# remove `__variables__`
var_path = os.path.join(model_path, VARIABLE_FILENAME)
os.remove(var_path)
with self.assertRaises(ValueError):
paddle.jit.load(model_path)
class TestSaveLoadWithInputSpec(unittest.TestCase):
......@@ -345,22 +444,6 @@ class TestJitSaveLoadConfig(unittest.TestCase):
np.array_equal(train_layer(x)[0].numpy(), infer_layer(x).numpy()))
class MultiLoadingLinearNet(fluid.dygraph.Layer):
def __init__(self, size, model_path):
super(MultiLoadingLinearNet, self).__init__()
self._linear = Linear(size, size)
self._load_linear1 = fluid.dygraph.jit.load(model_path)
self._load_linear2 = fluid.dygraph.jit.load(model_path)
@declarative
def forward(self, x):
tmp1 = self._linear(x)
tmp2 = self._load_linear1(tmp1)
tmp3 = self._load_linear2(tmp2)
y = self._linear(tmp3)
return y
class TestJitMultipleLoading(unittest.TestCase):
def setUp(self):
self.linear_size = 4
......@@ -389,20 +472,6 @@ class TestJitMultipleLoading(unittest.TestCase):
name_set.add(var.name)
class LinearNetReturnHidden(fluid.dygraph.Layer):
def __init__(self, in_size, out_size):
super(LinearNetReturnHidden, self).__init__()
self._linear_1 = Linear(in_size, out_size)
self._linear_2 = Linear(in_size, out_size)
@declarative
def forward(self, x):
y = self._linear_1(x)
z = self._linear_2(y)
loss = fluid.layers.mean(z)
return y, loss
class TestJitPruneModelAndLoad(unittest.TestCase):
def setUp(self):
self.linear_size = 4
......@@ -461,5 +530,230 @@ class TestJitPruneModelAndLoad(unittest.TestCase):
fluid.dygraph.jit.load(self.model_path)
class TestJitSaveMultiCases(unittest.TestCase):
def setUp(self):
# enable dygraph mode
fluid.enable_dygraph()
# config seed
paddle.manual_seed(SEED)
paddle.framework.random._manual_program_seed(SEED)
def verify_inference_correctness(self, layer, model_path, with_label=False):
layer.eval()
loaded_layer = paddle.jit.load(model_path)
loaded_layer.eval()
# inference & compare
x = paddle.to_variable(np.random.random((1, 784)).astype('float32'))
if with_label:
y = paddle.to_variable(np.random.random((1, 1)).astype('int64'))
pred, _ = layer(x, y)
pred = pred.numpy()
else:
pred = layer(x).numpy()
loaded_pred = loaded_layer(x).numpy()
self.assertTrue(
np.array_equal(pred, loaded_pred),
msg="Result diff when load and inference:\nlayer result:\n{}\n" \
"loaded layer result:\n{}".format(pred, loaded_pred))
def test_no_prune_to_static_after_train(self):
layer = LinearNet(784, 1)
train(layer)
model_path = "test_no_prune_to_static_after_train"
paddle.jit.save(layer, model_path)
self.verify_inference_correctness(layer, model_path)
def test_no_prune_to_static_no_train(self):
layer = LinearNetWithInputSpec(784, 1)
model_path = "test_no_prune_to_static_no_train"
paddle.jit.save(layer, model_path)
self.verify_inference_correctness(layer, model_path)
def test_no_prune_no_to_static_after_train(self):
layer = LinearNetNotDeclarative(784, 1)
train(layer)
model_path = "test_no_prune_no_to_static_after_train"
paddle.jit.save(
layer,
model_path,
input_spec=[InputSpec(
shape=[None, 784], dtype='float32')])
self.verify_inference_correctness(layer, model_path)
def test_no_prune_no_to_static_after_train_with_examples(self):
layer = LinearNetNotDeclarative(784, 1)
example_inputs, _, _ = train(layer)
model_path = "test_no_prune_no_to_static_after_train_with_examples"
fluid.dygraph.jit.save(
layer=layer, model_path=model_path, input_spec=example_inputs)
self.verify_inference_correctness(layer, model_path)
def test_no_prune_no_to_static_no_train(self):
layer = LinearNetNotDeclarative(784, 1)
model_path = "test_no_prune_no_to_static_no_train"
paddle.jit.save(
layer,
model_path,
input_spec=[InputSpec(
shape=[None, 784], dtype='float32')])
self.verify_inference_correctness(layer, model_path)
def test_prune_to_static_after_train(self):
layer = LinerNetWithLabel(784, 1)
out = train_with_label(layer)
model_path = "test_prune_to_static_after_train"
configs = paddle.SaveLoadConfig()
configs.output_spec = [out]
paddle.jit.save(
layer,
model_path,
input_spec=[
InputSpec(
shape=[None, 784], dtype='float32', name="image")
],
configs=configs)
self.verify_inference_correctness(layer, model_path, True)
def test_prune_to_static_no_train(self):
layer = LinerNetWithLabel(784, 1)
model_path = "test_prune_to_static_no_train"
configs = paddle.SaveLoadConfig()
# TODO: without training, the output_spec var is not available here,
# so for now outputs can only be selected by index
configs.output_spec = layer.forward.outputs[:1]
paddle.jit.save(
layer,
model_path,
input_spec=[
InputSpec(
shape=[None, 784], dtype='float32', name="image")
],
configs=configs)
self.verify_inference_correctness(layer, model_path, True)
def test_no_prune_input_spec_name_warning(self):
layer = LinearNetWithInputSpec(784, 1)
train(layer)
model_path = "test_no_prune_input_spec_name_warning"
paddle.jit.save(
layer,
model_path,
input_spec=[InputSpec(
shape=[None, 784], dtype='float32')])
paddle.jit.save(
layer,
model_path,
input_spec=[
InputSpec(
shape=[None, 784], dtype='float32', name='feed_input')
])
self.verify_inference_correctness(layer, model_path)
def test_not_prune_output_spec_name_warning(self):
layer = LinearNet(784, 1)
train(layer)
model_path = "test_not_prune_output_spec_name_warning"
configs = paddle.SaveLoadConfig()
out = paddle.to_variable(np.random.random((1, 1)).astype('float'))
configs.output_spec = [out]
paddle.jit.save(layer, model_path, configs=configs)
self.verify_inference_correctness(layer, model_path)
def test_prune_input_spec_name_error(self):
layer = LinerNetWithLabel(784, 1)
model_path = "test_prune_input_spec_name_error"
with self.assertRaises(ValueError):
paddle.jit.save(
layer,
model_path,
input_spec=[InputSpec(
shape=[None, 784], dtype='float32')])
with self.assertRaises(ValueError):
paddle.jit.save(
layer,
model_path,
input_spec=[
InputSpec(
shape=[None, 784], dtype='float32', name='feed_input')
])
def test_prune_output_spec_name_error(self):
layer = LinerNetWithLabel(784, 1)
train_with_label(layer)
model_path = "test_prune_to_static_after_train"
configs = paddle.SaveLoadConfig()
out = paddle.to_variable(np.random.random((1, 1)).astype('float'))
configs.output_spec = [out]
with self.assertRaises(ValueError):
paddle.jit.save(
layer,
model_path,
input_spec=[
InputSpec(
shape=[None, 784], dtype='float32', name="image")
],
configs=configs)
class TestJitSaveLoadEmptyLayer(unittest.TestCase):
def setUp(self):
self.model_path = "model.jit_save_load_empty_layer"
# enable dygraph mode
paddle.disable_static()
def test_save_load_empty_layer(self):
layer = EmptyLayer()
x = paddle.to_variable(np.random.random((10)).astype('float32'))
out = layer(x)
paddle.jit.save(layer, self.model_path)
load_layer = paddle.jit.load(self.model_path)
load_out = load_layer(x)
self.assertTrue(np.array_equal(out, load_out))
class TestJitSaveLoadNoParamLayer(unittest.TestCase):
def setUp(self):
self.model_path = "model.jit_save_load_no_param_layer"
# enable dygraph mode
paddle.disable_static()
def test_save_load_no_param_layer(self):
layer = NoParamLayer()
x = paddle.to_variable(np.random.random((5)).astype('float32'))
y = paddle.to_variable(np.random.random((5)).astype('float32'))
out = layer(x, y)
paddle.jit.save(layer, self.model_path)
load_layer = paddle.jit.load(self.model_path)
load_out = load_layer(x, y)
self.assertTrue(np.array_equal(out, load_out))
if __name__ == '__main__':
unittest.main()
......@@ -46,8 +46,8 @@ class TestLogsumexp(OpTest):
self.inputs = {'X': x}
self.outputs = {'Out': out}
self.attrs = {
'dim': self.axis,
'keep_dim': self.keepdim,
'axis': self.axis,
'keepdim': self.keepdim,
'reduce_all': self.reduce_all
}
......
......@@ -189,15 +189,15 @@ class TestMathOpPatches(unittest.TestCase):
@prog_scope()
def test_integer_div(self):
a = fluid.layers.data(name="a", shape=[1], dtype='int64')
b = a / 2
b = a / 7
place = fluid.CPUPlace()
exe = fluid.Executor(place)
a_np = numpy.array([3, 4, 10, 14, 9, 18])
a_np = numpy.array([3, 4, 10, 14, 9, 18]).astype('int64')
b_np, = exe.run(fluid.default_main_program(),
feed={"a": a_np},
fetch_list=[b])
# for paddle2.0, use true_divide
b_np_actual = (a_np / 2.0)
b_np_actual = (a_np / 7).astype('int64')
self.assertTrue(numpy.array_equal(b_np, b_np_actual))
@prog_scope()
......
......@@ -27,6 +27,7 @@ import paddle.fluid.core as core
from paddle.io import Dataset, IterableDataset, BatchSampler, DataLoader
from paddle.fluid.dygraph.nn import Linear
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.dataloader.dataloader_iter import _worker_loop
class RandomDataset(Dataset):
......@@ -185,9 +186,10 @@ class TestDataLoaderWorkerLoop(unittest.TestCase):
for i in range(10):
indices_queue.put([i, i + 10])
indices_queue.put(None)
loader._worker_loop(
loader._dataset, 0, indices_queue, loader._data_queue,
loader._workers_done_event, _collate_fn, _init_fn, 0, 1)
_worker_loop(loader._dataset, 0, indices_queue,
loader._data_queue, loader._workers_done_event,
_collate_fn, _init_fn, 0, 1,
loader._use_shared_memory)
self.assertTrue(False)
except AssertionError:
pass
......@@ -228,9 +230,10 @@ class TestDataLoaderWorkerLoop(unittest.TestCase):
indices_queue.put([i, i + 10])
indices_queue.put(None)
loader._workers_done_event.set()
loader._worker_loop(
loader._dataset, 0, indices_queue, loader._data_queue,
loader._workers_done_event, _collate_fn, _init_fn, 0, 1)
_worker_loop(loader._dataset, 0, indices_queue,
loader._data_queue, loader._workers_done_event,
_collate_fn, _init_fn, 0, 1,
loader._use_shared_memory)
self.assertTrue(True)
except AssertionError:
pass
......
......@@ -17,6 +17,7 @@ from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import paddle.fluid.core as core
......@@ -153,6 +154,30 @@ class TestMulDoubleGradCheck(unittest.TestCase):
class TestReshapeDoubleGradCheck(unittest.TestCase):
@prog_scope()
def func(self, place):
x_shape = [3, 12]
expand_times = [4, 9]
eps = 0.005
dtype = np.float64
x = layers.data('x', x_shape, False, dtype)
x.persistable = True
out = layers.expand(x, expand_times)
x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype)
gradient_checker.double_grad_check(
[x], out, x_init=x_arr, place=place, eps=eps)
def test_grad(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for p in places:
self.func(p)
class TestExpandDoubleGradCheck(unittest.TestCase):
@prog_scope()
def func(self, place):
x_shape = [3, 12]
......@@ -176,5 +201,53 @@ class TestReshapeDoubleGradCheck(unittest.TestCase):
self.func(p)
class TestTileDoubleGradCheck(unittest.TestCase):
@prog_scope()
def func(self, place):
x_shape = [3, 12]
repeat_times = [4, 9]
eps = 0.005
dtype = np.float64
x = layers.data('x', x_shape, False, dtype)
x.persistable = True
out = paddle.tile(x, repeat_times)
x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype)
gradient_checker.double_grad_check(
[x], out, x_init=x_arr, place=place, eps=eps)
def test_grad(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for p in places:
self.func(p)
class TestExpandV2DoubleGradCheck(unittest.TestCase):
@prog_scope()
def func(self, place):
x_shape = [1, 12]
new_shape = [4, 12]
eps = 0.005
dtype = np.float64
x = layers.data('x', x_shape, False, dtype)
x.persistable = True
out = paddle.expand(x, new_shape)
x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype)
gradient_checker.double_grad_check(
[x], out, x_init=x_arr, place=place, eps=eps)
def test_grad(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for p in places:
self.func(p)
if __name__ == "__main__":
unittest.main()
......@@ -261,7 +261,13 @@ class TestMultiOptimizersMultiCardsError(unittest.TestCase):
exe.run(startup_program)
np.random.seed(SEED)
# NOTE(liym27):
# This test needs to run on multiple cards to trigger NotImplementedError.
# It was moved out of RUN_TYPE=DIST in tests/unittests/CMakeLists.txt so that
# it uses multiple cards ** only on CPU **, not GPU, to reduce CI time.
os.environ['CPU_NUM'] = str(2)
pe_exe = fluid.ParallelExecutor(
use_cuda=use_cuda,
main_program=main_program,
......
......@@ -176,7 +176,7 @@ class TestCRFModel(unittest.TestCase):
place=fluid.CPUPlace())
data = train_data()
for i in range(10):
for i in range(4):
cur_batch = next(data)
print(exe.run(train_cp,
feed=feeder.feed(cur_batch),
......
......@@ -248,8 +248,7 @@ class PolicyGradient(object):
func=reward_func, x=[action, length], out=reward)
neg_log_prob = layers.cross_entropy(act_prob, action)
cost = neg_log_prob * reward
cost = (layers.reduce_sum(cost) /
layers.cast(layers.reduce_sum(length), "float32")
cost = (layers.reduce_sum(cost) / layers.reduce_sum(length)
) if length is not None else layers.reduce_mean(cost)
optimizer = fluid.optimizer.Adam(self.lr)
optimizer.minimize(cost)
......
......@@ -167,8 +167,9 @@ class API_TestDyUnsqueezeAxisTensor(unittest.TestCase):
with fluid.dygraph.guard():
input1 = np.random.random([5, 10]).astype("int32")
out1 = np.expand_dims(input1, axis=1)
out1 = np.expand_dims(out1, axis=2)
input = fluid.dygraph.to_variable(input1)
output = paddle.unsqueeze(input, axis=paddle.to_tensor([1]))
output = paddle.unsqueeze(input, axis=paddle.to_tensor([1, 2]))
out_np = output.numpy()
self.assertTrue(np.array_equal(out1, out_np))
self.assertEqual(out1.shape, out_np.shape)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
import paddle.fluid as fluid
import paddle.fluid.contrib.mixed_precision.amp_nn as amp_nn
class TestUpdateLossScalingOp(OpTest):
def setUp(self):
self.op_type = "update_loss_scaling"
self.init()
found_inf = np.array([False], dtype=np.bool)
x = np.random.random((1024, 1024)).astype(self.dtype)
self.inputs = {
'X': [('x0', x)],
'FoundInfinite': found_inf,
'PrevLossScaling': self.prev_loss_scaling,
'InGoodSteps': self.num_good_steps,
'InBadSteps': self.num_bad_steps
}
self.outputs = {
'Out': [('out0', np.zeros_like(x))],
'LossScaling': self.prev_loss_scaling * self.incr_ratio,
'OutGoodSteps': self.zero_steps,
'OutBadSteps': self.zero_steps
}
def init(self):
self.incr_ratio = 2.0
self.decr_ratio = 0.8
self.dtype = np.float32
self.prev_loss_scaling = np.array([2048]).astype(self.dtype)
self.num_good_steps = np.array([999], dtype=np.int32)
self.num_bad_steps = np.array([1], dtype=np.int32)
self.zero_steps = np.array([0], dtype=np.int32)
self.attrs = {
'incr_every_n_steps': 1000,
'decr_every_n_nan_or_inf': 2,
'incr_ratio': self.incr_ratio,
'decr_ratio': self.decr_ratio,
}
def test_check_output(self):
self.check_output(no_check_set=['Out'])
class TestUpdateLossScalingOpBad(TestUpdateLossScalingOp):
def setUp(self):
self.op_type = "update_loss_scaling"
self.init()
found_inf = np.array([True], dtype=np.bool)
x = np.random.random((1024, 1024)).astype(self.dtype)
i = np.random.randint(0, 1024, 1)
j = np.random.randint(0, 1024, 1)
x[i[0]][j[0]] = np.inf
self.inputs = {
'X': [('x0', x)],
'FoundInfinite': found_inf,
'PrevLossScaling': self.prev_loss_scaling,
'InGoodSteps': self.num_good_steps,
'InBadSteps': self.num_bad_steps
}
self.outputs = {
'Out': [('out0', np.zeros_like(x))],
'LossScaling': self.prev_loss_scaling * self.decr_ratio,
'OutGoodSteps': self.zero_steps,
'OutBadSteps': self.zero_steps
}
def test_check_output(self):
self.check_output()
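The update rule these two cases encode can be restated in plain Python. The sketch below is a reference inferred from the expected outputs above, not the operator's actual implementation:

def update_loss_scaling_ref(found_inf, scaling, good, bad,
                            incr_every_n_steps, decr_every_n_nan_or_inf,
                            incr_ratio, decr_ratio):
    # good/bad are the running counts of clean and inf/nan steps
    if found_inf:
        good = 0
        bad += 1
        if bad == decr_every_n_nan_or_inf:
            scaling = scaling * decr_ratio  # shrink the scaling after repeated overflow
            bad = 0
    else:
        bad = 0
        good += 1
        if good == incr_every_n_steps:
            scaling = scaling * incr_ratio  # grow the scaling after a clean streak
            good = 0
    return scaling, good, bad

# TestUpdateLossScalingOp: 999 prior good steps + one clean step -> scaling doubles
assert update_loss_scaling_ref(False, 2048.0, 999, 1, 1000, 2, 2.0, 0.8) == (4096.0, 0, 0)
# TestUpdateLossScalingOpBad: a second bad step -> scaling shrinks by decr_ratio
assert update_loss_scaling_ref(True, 2048.0, 999, 1, 1000, 2, 2.0, 0.8) == (2048.0 * 0.8, 0, 0)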
class TestUpdateLossScalingLayer(unittest.TestCase):
def loss_scaling_check(self, use_cuda=True, scope=fluid.Scope()):
a = fluid.data(name="a", shape=[1024, 1024], dtype='float32')
b = fluid.data(name="b", shape=[512, 128], dtype='float32')
x = [a, b]
found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool')
prev_loss_scaling = fluid.data(
name="prev_loss_scaling", shape=[1], dtype='float32')
num_good_steps = fluid.data(
name="num_good_steps", shape=[1], dtype='int32')
num_bad_steps = fluid.data(
name="num_bad_steps", shape=[1], dtype='int32')
a_v = np.random.random([1024, 1024]).astype('float32')
b_v = np.random.random([512, 128]).astype('float32')
found_inf_v = np.array([False]).astype('bool')
prev_loss_scaling_v = np.array([2048]).astype('float32')
num_good_steps_v = np.array([999], dtype=np.int32)
num_bad_steps_v = np.array([1], dtype=np.int32)
incr_every_n_steps = 1000
decr_every_n_nan_or_inf = 2
incr_ratio = 2
decr_ratio = 0.8
result = amp_nn.update_loss_scaling(
x,
found_inf,
prev_loss_scaling,
num_good_steps,
num_bad_steps,
incr_every_n_steps,
decr_every_n_nan_or_inf,
incr_ratio,
decr_ratio,
name="update_loss_scaling")
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
with fluid.scope_guard(scope):
exe.run(fluid.default_startup_program())
result_v = exe.run(feed={
'a': a_v,
'b': b_v,
'found_inf': found_inf_v,
'prev_loss_scaling': prev_loss_scaling_v,
'num_good_steps': num_good_steps_v,
'num_bad_steps': num_bad_steps_v
},
fetch_list=[
result, x, found_inf, prev_loss_scaling,
num_good_steps, num_bad_steps
])
assert np.array_equal(result_v[0], a_v)
assert np.array_equal(result_v[1], b_v)
assert np.array_equal(result_v[0], result_v[2])
assert np.array_equal(result_v[1], result_v[3])
assert np.array_equal(result_v[4], found_inf_v)
assert np.array_equal(result_v[5], prev_loss_scaling_v * incr_ratio)
assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v))
assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v))
def loss_scaling_check_inf(self, use_cuda=True, scope=fluid.Scope()):
a = fluid.data(name="a", shape=[1024, 1024], dtype='float32')
b = fluid.data(name="b", shape=[512, 128], dtype='float32')
x = [a, b]
found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool')
prev_loss_scaling = fluid.data(
name="prev_loss_scaling", shape=[1], dtype='float32')
num_good_steps = fluid.data(
name="num_good_steps", shape=[1], dtype='int32')
num_bad_steps = fluid.data(
name="num_bad_steps", shape=[1], dtype='int32')
a_v = np.random.random([1024, 1024]).astype('float32')
b_v = np.random.random([512, 128]).astype('float32')
i = np.random.randint(0, 1024, 1)
j = np.random.randint(0, 1024, 1)
a_v[i[0]][j[0]] = np.inf
found_inf_v = np.array([True]).astype('bool')
prev_loss_scaling_v = np.array([2048]).astype('float32')
num_good_steps_v = np.array([999], dtype=np.int32)
num_bad_steps_v = np.array([1], dtype=np.int32)
incr_every_n_steps = 1000
decr_every_n_nan_or_inf = 2
incr_ratio = 2
decr_ratio = 0.8
result = amp_nn.update_loss_scaling(
x,
found_inf,
prev_loss_scaling,
num_good_steps,
num_bad_steps,
incr_every_n_steps,
decr_every_n_nan_or_inf,
incr_ratio,
decr_ratio,
name="update_loss_scaling")
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
with fluid.scope_guard(scope):
exe.run(fluid.default_startup_program())
result_v = exe.run(feed={
'a': a_v,
'b': b_v,
'found_inf': found_inf_v,
'prev_loss_scaling': prev_loss_scaling_v,
'num_good_steps': num_good_steps_v,
'num_bad_steps': num_bad_steps_v
},
fetch_list=[
result, x, found_inf, prev_loss_scaling,
num_good_steps, num_bad_steps
])
assert np.array_equal(result_v[0], np.zeros_like(a_v))
assert np.array_equal(result_v[1], np.zeros_like(b_v))
assert np.array_equal(result_v[2], np.zeros_like(a_v))
assert np.array_equal(result_v[3], np.zeros_like(b_v))
assert np.array_equal(result_v[4], found_inf_v)
assert np.array_equal(result_v[5], prev_loss_scaling_v * decr_ratio)
assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v))
assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v))
def test_loss_scaling_cpu(self):
main = fluid.Program()
startup = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, startup):
self.loss_scaling_check(use_cuda=False)
def test_loss_scaling_cpu_inf(self):
main = fluid.Program()
startup = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, startup):
self.loss_scaling_check_inf(use_cuda=False)
def test_loss_scaling_gpu(self):
if fluid.core.is_compiled_with_cuda():
main = fluid.Program()
startup = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, startup):
self.loss_scaling_check(use_cuda=True)
def test_loss_scaling_gpu_inf(self):
if fluid.core.is_compiled_with_cuda():
main = fluid.Program()
startup = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, startup):
self.loss_scaling_check_inf(use_cuda=True)
if __name__ == '__main__':
unittest.main()
......@@ -25,6 +25,7 @@ no_check_set_white_list = [
'unsqueeze2',
'cross_entropy2',
'seed',
'amp_check_finite_and_scale',
'check_finite_and_unscale',
'update_loss_scaling',
'cudnn_lstm',
]
......@@ -450,7 +450,7 @@ def interpolate(x,
for i in range(len(x.shape) - 2):
scale_list.append(scale)
attrs['scale'] = list(map(float, scale_list))
elif isinstance(scale, list) or isinstance(scale, float):
elif isinstance(scale, list) or isinstance(scale, tuple):
if len(scale) != len(x.shape) - 2:
raise ValueError("scale_shape length should be {} for "
"input {}-D tensor.".format(
......
......@@ -1009,8 +1009,7 @@ def ctc_loss(log_probs,
loss_out = fluid.layers.squeeze(loss_out, [-1])
assert reduction in ['mean', 'sum', 'none']
if reduction == 'mean':
loss_out = paddle.mean(loss_out / paddle.cast(label_lengths,
loss_out.dtype))
loss_out = paddle.mean(loss_out / label_lengths)
elif reduction == 'sum':
loss_out = paddle.sum(loss_out)
return loss_out
......
......@@ -250,3 +250,47 @@ class Adam(Optimizer):
stop_gradient=True)
return adam_op
@framework.dygraph_only
def step(self):
"""
Execute the optimizer and update parameters once.
Returns:
None
Examples:
.. code-block:: python
import paddle
import numpy as np
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5)
# This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Adam(learning_rate = 0.01,
parameters = linear.parameters())
out = linear(a)
out.backward()
adam.step()
adam.clear_grad()
"""
parameter_list = self._parameter_list
self._dtype = None
params_grads = []
for param in self._parameter_list:
if not param.trainable:
continue
if hasattr(
param, "_is_sparse"
) and param._is_sparse and self.regularization is not None:
raise RuntimeError(
"Adam don't support weight_decay with sparse parameters, please set it to None."
)
if param._grad_ivar() is not None:
grad_var = param._grad_ivar()
params_grads.append((param, grad_var))
optimize_ops = self._apply_optimize(
loss=None, startup_program=None, params_grads=params_grads)
......@@ -40,6 +40,7 @@ from .creation import full_like #DEFINE_ALIAS
from .creation import triu #DEFINE_ALIAS
from .creation import tril #DEFINE_ALIAS
from .creation import meshgrid #DEFINE_ALIAS
from .creation import empty #DEFINE_ALIAS
from .io import save #DEFINE_ALIAS
from .io import load #DEFINE_ALIAS
from .linalg import matmul #DEFINE_ALIAS
......
......@@ -48,6 +48,7 @@ __all__ = [
'eye',
'full',
'full_like',
'empty',
'triu',
'tril',
'meshgrid'
......@@ -62,8 +63,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True):
If the ``data`` is already a tensor, and ``dtype`` or ``place`` doesn't change, no copy
will be performed and the origin tensor will be returned; otherwise a new tensor will be constructed
and returned. Similarly, if the data is a numpy\.ndarray with the same ``dtype``
and the current place is cpu, no copy will be performed.
and returned.
The ``ComplexTensor`` is a unique type of paddle. If x is ``ComplexTensor``, then
``x.real`` is the real part, and ``x.imag`` is the imaginary part.
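A minimal sketch of the copy semantics described above (illustrative only; the complex example assumes ``to_tensor`` accepts complex ndarrays, as the ComplexTensor constructor branch below suggests):

import numpy as np
import paddle

paddle.disable_static()
t = paddle.to_tensor(np.array([1.0, 2.0], dtype='float32'))  # constructs a new tensor
t2 = paddle.to_tensor(t)       # same dtype and place: the origin tensor is returned
ct = paddle.to_tensor(np.array([1 + 2j], dtype='complex64'))
print(ct.real, ct.imag)        # real and imaginary parts of the ComplexTensor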
......@@ -208,20 +208,20 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True):
value=data,
place=place,
persistable=False,
zero_copy=True,
zero_copy=False,
stop_gradient=stop_gradient)
else:
name = unique_name.generate('generated_tensor')
real_tensor = paddle.Tensor(
value=data.real,
place=place,
zero_copy=True,
zero_copy=False,
name=name + ".real",
stop_gradient=stop_gradient)
imag_tensor = paddle.Tensor(
value=data.imag,
place=place,
zero_copy=True,
zero_copy=False,
name=name + ".imag",
stop_gradient=stop_gradient)
return paddle.ComplexTensor(real_tensor, imag_tensor)
......@@ -981,3 +981,90 @@ def diag(x, offset=0, padding_value=0, name=None):
out.stop_gradient = True
return out
def empty(shape, dtype=None, name=None):
"""
This Op returns a Tensor with uninitialized data whose size is the same as ``shape``.
Args:
shape(list|tuple|Tensor): Shape of the Tensor to be created.
The data type of the elements of ``shape`` is ``int32`` or ``int64``. If ``shape`` is a list or tuple,
its elements should be integers or Tensors with shape [1].
If ``shape`` is a Tensor, it should be a 1-D Tensor.
dtype(np.dtype|str, optional): Data type of the output Tensor,
which can be bool, float16, float32, float64, int32 or int64. If ``dtype`` is `None`, the data
type of the created Tensor is the global default dtype (see ``get_default_dtype``
for details).
name(str, optional): The default value is None. Normally there is no need for the user to set this
property. For more information, please refer to :ref:`api_guide_Name`.
Returns:
Tensor: Tensor which is created according to ``shape`` and ``dtype``, and is uninitialized.
Examples:
.. code-block:: python
import paddle
import numpy as np
paddle.disable_static() # Now we are in imperative mode
paddle.set_device("cpu") # and use cpu device
# example 1: argument ``shape`` is a list which doesn't contain Tensor.
data1 = paddle.empty(shape=[2,3], dtype='float32')
#[[4.3612203e+27 1.8176809e+31 1.3555911e-19] # uninitialized
# [1.1699684e-19 1.3563156e-19 3.6408321e-11]] # uninitialized
# example 2: argument ``shape`` is a Tensor, the data type must be int64 or int32.
shape_data = np.array([2, 3]).astype('int32')
shape = paddle.to_tensor(shape_data)
data2 = paddle.empty(shape=shape, dtype='float32')
#[[1.7192326e-37 4.8125365e-38 1.9866003e-36] # uninitialized
# [1.3284029e-40 7.1117408e-37 2.5353012e+30]] # uninitialized
# example 3: argument ``shape`` is a list which contains Tensor.
dim2_data = np.array([3]).astype('int32')
dim2 = paddle.to_tensor(dim2_data)
data3 = paddle.empty(shape=[2, dim2], dtype='float32')
#[[1.1024214e+24 7.0379409e+22 6.5737699e-34] # uninitialized
# [7.5563101e+31 7.7130405e+31 2.8020654e+20]] # uninitialized
"""
if dtype is None:
dtype = paddle.get_default_dtype()
dtype = convert_dtype(dtype)
if in_dygraph_mode():
shape = utils.convert_shape_to_list(shape)
out = core.ops.empty('shape', shape, 'dtype',
convert_np_dtype_to_dtype_(dtype))
out.stop_gradient = True
return out
helper = LayerHelper("empty", **locals())
inputs = {}
check_dtype(dtype, 'dtype',
['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
'empty')
check_type(shape, 'shape', (Variable, list, tuple), 'empty')
if isinstance(shape, Variable):
check_dtype(shape.dtype, 'shape', ['int32', 'int64'], 'empty')
attrs = {}
utils.get_shape_tensor_inputs(
inputs=inputs, attrs=attrs, shape=shape, op_type='empty')
out = helper.create_variable_for_type_inference(dtype=dtype)
attrs['dtype'] = convert_np_dtype_to_dtype_(dtype)
helper.append_op(
type='empty',
inputs=inputs,
outputs={'Out': [out]},
attrs=attrs,
stop_gradient=True)
out.stop_gradient = True
return out
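The ``LayerHelper`` branch above also serves static-graph mode; a minimal sketch of exercising it (the program/executor boilerplate is illustrative):

import paddle
import paddle.fluid as fluid

main, startup = fluid.Program(), fluid.Program()
with fluid.program_guard(main, startup):
    out = paddle.empty(shape=[2, 3], dtype='float32')  # appends the 'empty' op

exe = fluid.Executor(fluid.CPUPlace())
exe.run(startup)
res, = exe.run(main, fetch_list=[out])
print(res.shape)  # (2, 3); the values themselves are uninitialized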
......@@ -64,7 +64,6 @@ from ..fluid.layers import increment #DEFINE_ALIAS
from ..fluid.layers import multiplex #DEFINE_ALIAS
from ..fluid.layers import sums #DEFINE_ALIAS
from ..fluid import layers
import paddle
__all__ = [
......@@ -343,69 +342,9 @@ def divide(x, y, name=None):
axis = -1
act = None
if in_dygraph_mode():
# rule 1 : avoid numpy.ndarray
if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
raise TypeError("divide(): arguments must be Tensor or scalar, not numpy.ndarray.")
# rule 2: neither input is a Tensor
elif not isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor):
x = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=x)
y = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=y)
# rule 3: both inputs are Tensors
elif isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor):
if y.dtype != x.dtype:
raise TypeError("divide(): argument position 1 and argument position 2 must have the same dtype. "
"But x is {}, y is {}".format(x.dtype, y.dtype))
elif x.dtype in _supported_int_dtype_:
x = x.astype(paddle.get_default_dtype())
y = y.astype(paddle.get_default_dtype())
# rule 4: x is Tensor, y is scalar
elif isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor):
if x.dtype in _supported_int_dtype_:
x = x.astype(paddle.get_default_dtype())
y = paddle.full(shape=[1], dtype=x.dtype, fill_value=y)
# rule 5: x is scalar, y is Tensor
elif not isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor):
if y.dtype in _supported_int_dtype_:
y = y.astype(paddle.get_default_dtype())
x = paddle.full(shape=[1], dtype=y.dtype, fill_value=x)
return _elementwise_op_in_dygraph(
x, y, axis=axis, act=act, op_name=op_type)
# rule 1 : avoid numpy.ndarray
if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
raise TypeError("divide(): arguments must be Tensor or scalar, not numpy.ndarray.")
# rule 2: neither input is a Tensor
elif not isinstance(x, Variable) and not isinstance(y, Variable):
x = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=x)
y = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=y)
# rule 3: both inputs are Tensors
elif isinstance(x, Variable) and isinstance(y, Variable):
if y.dtype != x.dtype:
raise TypeError("divide(): argument position 1 and argument position 2 must have the same dtype. "
"But x is {}, y is {}".format(x.dtype, y.dtype))
elif x.dtype in _supported_int_dtype_:
x = paddle.cast(x, paddle.get_default_dtype())
y = paddle.cast(y, paddle.get_default_dtype())
# rule 4: x is Tensor, y is scalar
elif isinstance(x, Variable) and not isinstance(y, Variable):
if x.dtype in _supported_int_dtype_:
x = paddle.cast(x, paddle.get_default_dtype())
y = paddle.fill_constant(shape=[1], dtype=x.dtype, value=y)
# rule 5: x is scalar, y is Tensor
elif not isinstance(x, Variable) and isinstance(y, Variable):
if y.dtype in _supported_int_dtype_:
y = paddle.cast(y, paddle.get_default_dtype())
x = paddle.fill_constant(shape=[1], dtype=y.dtype, value=x)
return _elementwise_op(LayerHelper(op_type, **locals()))
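Taken together, the rules above mean integer inputs are promoted to the default float dtype before a true division; a minimal sketch (the printed values are assumptions):

import paddle

paddle.disable_static()
a = paddle.to_tensor([3, 4], dtype='int64')
b = paddle.to_tensor([2, 2], dtype='int64')
print(paddle.divide(a, b).numpy())  # rule 3: int64 cast to the default float dtype -> [1.5 2. ]
print(paddle.divide(a, 2).numpy())  # rule 4: the scalar becomes a 1-element tensor of x's dtype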
......@@ -444,55 +383,9 @@ def floor_divide(x, y, name=None):
op_type = 'elementwise_floordiv'
axis = -1
if in_dygraph_mode():
# rule 1 : avoid numpy.ndarray
if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
raise TypeError("floor_divide(): arguments must be Tensor or scalar, not numpy.ndarray.")
# rule 2: neither input is a Tensor
elif not isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor):
x = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=x)
y = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=y)
# rule 3: both inputs are Tensors
elif isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor):
if y.dtype != x.dtype:
raise TypeError("floor_divide(): argument position 1 and argument position 2 must have the same dtype. "
"But x is {}, y is {}".format(x.dtype, y.dtype))
# rule 4: x is Tensor, y is scalar
elif isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor):
y = paddle.full(shape=[1], dtype=x.dtype, fill_value=y)
# rule 5: x is scalar, y is Tensor
elif not isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor):
x = paddle.full(shape=[1], dtype=y.dtype, fill_value=x)
return _elementwise_op_in_dygraph(
x, y, axis=axis, op_name=op_type)
# rule 1 : avoid numpy.ndarray
if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
raise TypeError("divide(): arguments must be Tensor or scalar, not numpy.ndarray.")
# rule 2: both the inputs are not Tensor
elif not isinstance(x, Variable) and not isinstance(y, Variable):
x = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=x)
y = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=y)
# rule 3: both inputs are Tensors
elif isinstance(x, Variable) and isinstance(y, Variable):
if y.dtype != x.dtype:
raise TypeError("floor_divide(): argument position 1 and argument position 2 must have the same dtype. "
"But x is {}, y is {}".format(x.dtype, y.dtype))
# rule 4: x is Tensor, y is scalar
elif isinstance(x, Variable) and not isinstance(y, Variable):
y = paddle.fill_constant(shape=[1], dtype=x.dtype, value=y)
# rule 5: x is scalar, y is Tensor
elif not isinstance(x, Variable) and isinstance(y, Variable):
x = paddle.fill_constant(shape=[1], dtype=y.dtype, value=x)
return _elementwise_op(LayerHelper(op_type, **locals()))
......@@ -531,43 +424,9 @@ def remainder(x, y, name=None):
op_type = 'elementwise_mod'
axis = -1
if in_dygraph_mode():
# rule 1 : avoid numpy.ndarray
if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
raise TypeError("remainder(): arguments must be Tensor or scalar, not numpy.ndarray.")
elif not isinstance(x, paddle.Tensor):
raise TypeError("remainder(): arguments position 1 must be Tensor, not {}".format(type(x)))
# rule 3: both the inputs are Tensor
elif isinstance(y, paddle.Tensor):
if y.dtype != x.dtype:
raise TypeError("remainder(): argument position 1 and argument position 2 must have the same dtype."
"But x is {}, y is {}".format(x.dtype, y.dtype))
# rule 4: x is Tensor, y is scalar
elif not isinstance(y, paddle.Tensor):
y = paddle.full(shape=[1], dtype=x.dtype, fill_value=y)
return _elementwise_op_in_dygraph(
x, y, axis=axis, op_name=op_type)
# rule 1 : avoid numpy.ndarray
if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
raise TypeError("remainder(): arguments must be Tensor or scalar, not numpy.ndarray.")
elif not isinstance(x, Variable):
raise TypeError("remainder(): arguments position 1 must be Tensor, not {}".format(type(x)))
# rule 3: both the inputs are Tensor
elif isinstance(y, Variable):
if y.dtype != x.dtype:
raise TypeError("remainder(): argument position 1 and argument position 2 must have the same dtype."
"But x is {}, y is {}".format(x.dtype, y.dtype))
# rule 4: x is Tensor, y is scalar
elif not isinstance(y, paddle.Tensor):
y = paddle.fill_constant(shape=[1], dtype=x.dtype, value=y)
return _elementwise_op(LayerHelper(op_type, **locals()))
......@@ -1194,15 +1053,14 @@ def logsumexp(x, axis=None, keepdim=False, name=None):
axis = [0]
if in_dygraph_mode():
return core.ops.logsumexp(x, 'dim', axis, 'keep_dim', keepdim,
'reduce_all', reduce_all)
return core.ops.logsumexp(x, 'axis', axis, 'keepdim', keepdim, 'reduce_all', reduce_all)
check_variable_and_dtype(x, 'x',
['float32', 'float64'],
'logsumexp')
helper = LayerHelper('logsumexp', **locals())
attrs = {'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all}
attrs = {'axis': axis, 'keepdim': keepdim, 'reduce_all': reduce_all}
out = helper.create_variable_for_type_inference(x.dtype)
helper.append_op(
type='logsumexp', inputs={'X': x}, outputs={'Out': out}, attrs=attrs)
......
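With the attributes renamed to match the public API, a dygraph call now looks like this (a minimal sketch, assuming ``logsumexp`` is exported at the ``paddle`` level as in the 2.0 API; shapes and values are illustrative):

import numpy as np
import paddle

paddle.disable_static()
x = paddle.to_tensor(np.array([[-1.5, 0., 2.], [3., 1.2, -2.4]], dtype='float32'))
out = paddle.logsumexp(x, axis=1, keepdim=True)  # log(sum(exp(x))) along axis 1, shape [2, 1]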
......@@ -287,12 +287,19 @@ fi
pip install PyGithub
# For getting PR related data
wget https://paddle-ci.gz.bcebos.com/blk/block.txt --no-check-certificate
wget https://sys-p0.bj.bcebos.com/bk-ci/bk.txt --no-check-certificate
HASUTFIXED=`python ${PADDLE_ROOT}/tools/check_ut.py | grep "has unit-test to be fixed" || true`
if [ "${HASUTFIXED}" != "" ]; then
echo_line="${HASUTFIXED} You must have one RD (chalsliu (Recommend) or kolinwei) approval.\n"
check_approval 1 45041955 22165420
fi
HASUTFIXED=`python ${PADDLE_ROOT}/tools/check_ut.py | grep "has benchmark issue to be fixed" || true`
if [ "${HASUTFIXED}" != "" ]; then
echo_line="${HASUTFIXED} You must have one RD (hysunflower or xiegegege or Xreki) approval.\n"
check_approval 1 52739577 46314656 12538138
fi
if [ -n "${echo_list}" ];then
echo "****************"
echo -e "${echo_list[@]}"
......
......@@ -27,9 +27,12 @@ class PRChecker(object):
self.github = Github(os.getenv('GITHUB_API_TOKEN'), timeout=60)
self.repo = None
def check(self):
""" check pr. """
filename = 'block.txt'
def check(self, filename, msg):
"""
Args:
filename (str): File from which to read the blocked user names.
msg (str): Error message.
"""
pr_id = os.getenv('GIT_PR_ID')
if not pr_id:
print('No PR ID')
......@@ -44,12 +47,10 @@ class PRChecker(object):
with open(filename) as f:
for l in f:
if l.rstrip('\r\n') == user:
print('{} has unit-test to be fixed, so CI failed.'.format(
user))
exit(1)
exit(0)
print('{} {}'.format(user, msg))
if __name__ == '__main__':
pr_checker = PRChecker()
pr_checker.check()
pr_checker.check('block.txt', 'has unit-test to be fixed, so CI failed.')
pr_checker.check('bk.txt', 'has benchmark issue to be fixed, so CI failed.')
......@@ -39,7 +39,7 @@
# Valid PADDLE_ENFORCE{_**} & PADDLE_THROW Count: 1706
# Invalid PADDLE_ENFORCE{_**} & PADDLE_THROW Count: 4572
ROOT_DIR=../paddle/fluid
ROOT_DIR=../../paddle/fluid
ALL_PADDLE_CHECK_CNT=0
VALID_PADDLE_CHECK_CNT=0
......
......@@ -59,7 +59,7 @@
. ./count_all_enforce.sh --source-only
ROOT_DIR=../paddle/fluid
ROOT_DIR=../../paddle/fluid
function count_dir_independently(){
local sub_dir_total_check_cnt=0
......