diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 6f4671c13a9e3dccb9be0a06f4bc2453af94bd55..7a94bda0f5f73e48081f68d7b2730e3df1e46232 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -16,7 +16,7 @@ else() set(paddle_known_gpu_archs8 "30 35 50 52 60 61") set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70") set(paddle_known_gpu_archs10 "30 35 50 52 60 61 70 75") - set(paddle_known_gpu_archs11 "35 50 52 60 61 70 75 80") + set(paddle_known_gpu_archs11 "52 60 61 70 75 80") endif() ###################################################################################### diff --git a/cmake/external/dgc.cmake b/cmake/external/dgc.cmake index 8472a0743b91e69d823ed62f94b55045a31aaabc..bc8611f3862cd14c0de493564ea82a1c9ce66667 100644 --- a/cmake/external/dgc.cmake +++ b/cmake/external/dgc.cmake @@ -19,7 +19,7 @@ SET(DGC_SOURCES_DIR "${THIRD_PARTY_PATH}/dgc/src/extern_dgc") SET(DGC_INSTALL_DIR "${THIRD_PARTY_PATH}/install/dgc") SET(DGC_INCLUDE_DIR "${DGC_INSTALL_DIR}/include" CACHE PATH "dgc include directory." FORCE) SET(DGC_LIBRARIES "${DGC_INSTALL_DIR}/lib/libdgc.a" CACHE FILEPATH "dgc library." FORCE) -SET(DGC_URL "http://fleet.bj.bcebos.com/collective_ef2216a.tgz") +SET(DGC_URL "https://fleet.bj.bcebos.com/dgc/collective_f66ef73.tgz") INCLUDE_DIRECTORIES(${DGC_INCLUDE_DIR}) cache_third_party(extern_dgc @@ -30,7 +30,7 @@ ExternalProject_Add( extern_dgc ${EXTERNAL_PROJECT_LOG_ARGS} "${DGC_DOWNLOAD_CMD}" - URL_MD5 "2f67549fd5f1262383d83289abc4f88f" + URL_MD5 "94e6fa1bc97169d0e1aad44570fe3251" PREFIX "${DGC_PREFIX_DIR}" SOURCE_DIR "${DGC_SOURCES_DIR}" CONFIGURE_COMMAND "" diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake index 8a655b2954dea5d6b864616ed2f4d19b167c4be8..3da550519bae2a12139873a2a84680debbaa8f4c 100644 --- a/cmake/external/lite.cmake +++ b/cmake/external/lite.cmake @@ -34,7 +34,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite) if(NOT LITE_GIT_TAG) - set(LITE_GIT_TAG dfdfa6440c83bf0b415f9f5a9ff84842ce0bb0fa) + set(LITE_GIT_TAG 6d2b2a4028a58715b01887b04eb9bff8432eb184) endif() if(NOT CUDA_ARCH_NAME) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index ae870b766fc3349ea53628e14c68ab9a5826213f..c0adda0da31ae1e7425ddfb352971444c09d5615 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -19,8 +19,8 @@ SET(MKLDNN_PREFIX_DIR ${THIRD_PARTY_PATH}/mkldnn) SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn) SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." 
FORCE) -SET(MKLDNN_REPOSITORY https://github.com/intel/mkl-dnn.git) -SET(MKLDNN_TAG 1ea812f4f5aa1bd989372a23ab50d0f0f81ee677) +SET(MKLDNN_REPOSITORY https://github.com/oneapi-src/oneDNN.git) +SET(MKLDNN_TAG 64a48f9565aa72f6359917b3406328075a409939) # Introduce variables: # * CMAKE_INSTALL_LIBDIR diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 0f6b1c182d5590354c8a970eea339a3e23846f39..ac6cf624e82c0a346fea42fa29fe9bab6ace8d47 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -18,7 +18,7 @@ SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc) SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc) SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) set(WARPCTC_REPOSITORY https://github.com/baidu-research/warp-ctc.git) -set(WARPCTC_TAG bc29dcfff07ced1c7a19a4ecee48e5ad583cef8e) +set(WARPCTC_TAG fc7f226b93758216a03b1be9d24593a12819b984) SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 9d07a0979d9392c9b2ab78562f8e0ceb8fc5d722..415e07c75425345f5f1ad29a8544e02a5bfb12e4 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -28,7 +28,15 @@ function(CheckCompilerCXX11Flag) endfunction() CheckCompilerCXX11Flag() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +if (WITH_GPU) + if (${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.0) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") + else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + endif() +else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +endif() # safe_set_flag # # Set a compile flag only if compiler is support diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index c9442e8f843ac152cac02908799a8d24f5951e58..9edfcb967abc26a25a94d368298c1c475295019f 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -243,9 +243,10 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC)) ENDIF() if(WITH_GPU) - include(external/cub) # download cub - list(APPEND third_party_deps extern_cub) - + if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + include(external/cub) # download cub + list(APPEND third_party_deps extern_cub) + endif() set(CUDAERROR_URL "http://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "" FORCE) file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") # download file cudaErrorMessage endif(WITH_GPU) diff --git a/paddle/fluid/framework/c/c_api.cc b/paddle/fluid/framework/c/c_api.cc index ab987fb56686594f505e63b6664c2176e5a4ad89..0dd2768ccb9ffa1dc7b85dca500095f8c10479c3 100644 --- a/paddle/fluid/framework/c/c_api.cc +++ b/paddle/fluid/framework/c/c_api.cc @@ -49,7 +49,8 @@ std::vector PD_GetGradOpDescStrs( for (size_t i = 0; i < op_num; ++i) { PADDLE_ENFORCE_EQ( grad_op_descs[i]->Proto()->SerializePartialToString(&ret[i]), true, - "Cannot serialize message."); + paddle::platform::errors::Unavailable( + "Cannot serialize operator desc message.")); } } return ret; diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto old mode 100755 new mode 100644 index 551d1342edeb335d1cad4782f85ae9f94f8739bd..edd1700ae7284c77883af6abd2cd7d511097685f --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -36,7 +36,10 @@ message AMPConfig { repeated string custom_black_varnames = 9; } -message LocalSGDConfig { optional int32 k_steps = 1 [ default = 4 ]; } +message 
LocalSGDConfig { + optional int32 k_steps = 1 [ default = 1 ]; + optional int32 begin_step = 2 [ default = 1 ]; +} message GradientMergeConfig { optional int32 k_steps = 1 [ default = 1 ]; @@ -52,6 +55,8 @@ message DGCConfig { message LarsConfig { optional float lars_coeff = 1 [ default = 0.001 ]; optional float lars_weight_decay = 2 [ default = 0.0005 ]; + optional float epsilon = 3 [ default = 0.0 ]; + repeated string exclude_from_weight_decay = 4; } message LambConfig { diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.cc b/paddle/fluid/framework/fleet/nccl_wrapper.cc index d5a25605cf81147b520bf541e38f4f75e53ae756..33a91388fd8cc97d181df46ab826d384860d38f5 100644 --- a/paddle/fluid/framework/fleet/nccl_wrapper.cc +++ b/paddle/fluid/framework/fleet/nccl_wrapper.cc @@ -25,7 +25,7 @@ bool NCCLWrapper::is_initialized_ = false; void NCCLWrapper::InitNCCL() { #if defined(PADDLE_WITH_NCCL) - PADDLE_ENFORCE(platform::dynload::ncclCommInitRank( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitRank( &(nccl_info_.comm_), nccl_info_.global_ranks_, nccl_info_.nccl_id_, nccl_info_.my_global_rank_)); #endif @@ -41,7 +41,8 @@ void NCCLWrapper::SetNCCLId(const NCCLInfo& nccl_info) { NCCLInfo NCCLWrapper::GetNCCLId() { #if defined(PADDLE_WITH_NCCL) - PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_))); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_))); #endif return nccl_info_; } @@ -52,8 +53,8 @@ void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank, nccl_info_.local_rank_ = local_rank; nccl_info_.my_global_rank_ = global_rank; nccl_info_.global_ranks_ = ranks; - PADDLE_ENFORCE(cudaSetDevice(local_rank)); - PADDLE_ENFORCE(cudaStreamCreate(&(nccl_info_.stream_))); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(local_rank)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&(nccl_info_.stream_))); #endif return; } @@ -65,7 +66,7 @@ void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope, auto var = scope.FindVar(name); LoDTensor* tensor = var->GetMutable(); int32_t total_size = tensor->numel(); - PADDLE_ENFORCE(platform::dynload::ncclBcast( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( reinterpret_cast(tensor->data()), total_size, ncclFloat, root_rank, nccl_info_.comm_, nccl_info_.stream_)); cudaStreamSynchronize(nccl_info_.stream_); diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc index 7f7f426d0e28224932fc96a3fefa0df1279e6475..4682bfc264b68997abd0a87233c5ed39e7e50a63 100644 --- a/paddle/fluid/framework/threadpool.cc +++ b/paddle/fluid/framework/threadpool.cc @@ -42,7 +42,8 @@ void ThreadPool::Init() { num_threads = FLAGS_dist_threadpool_size; VLOG(1) << "set dist_threadpool_size to " << num_threads; } - PADDLE_ENFORCE_GT(num_threads, 0); + PADDLE_ENFORCE_GT(num_threads, 0, platform::errors::InvalidArgument( + "The number of threads must be greater than 0.")); threadpool_.reset(new ThreadPool(num_threads)); } } @@ -83,7 +84,8 @@ void ThreadPool::TaskLoop() { } if (tasks_.empty()) { - PADDLE_THROW("This thread has no task to Run"); + PADDLE_THROW(platform::errors::Unavailable( + "Current thread has no task to Run.")); } // pop a task from the task queue diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index 654d81116b280bb6a52af3f83aeec284387f3b63..09528b6fc35bf49ac3110440a62aba3200341e15 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -91,7 +91,8 @@ class ThreadPool { {
std::unique_lock lock(mutex_); if (!running_) { - PADDLE_THROW("enqueue on stopped ThreadPool"); + PADDLE_THROW(platform::errors::Unavailable( + "Task is enqueued into a stopped ThreadPool.")); } tasks_.push(std::move(task)); } diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc index f3ea1f624ee836a483c37c2addb4d9766e87c107..2ee0b17b64b6df7a2f66b208f5b5879683db6656 100644 --- a/paddle/fluid/framework/var_desc.cc +++ b/paddle/fluid/framework/var_desc.cc @@ -43,8 +43,9 @@ void VarDesc::SetTensorDescNum(size_t num) { } break; default: PADDLE_THROW( - "Setting 'sub_tensor_number' is not supported by the type of var %s.", - this->Name()); + platform::errors::Unavailable("Setting 'sub_tensor_number' is not " + "supported by the %s type variable.", + this->Name())); } } @@ -55,8 +56,9 @@ size_t VarDesc::GetTensorDescNum() const { break; default: PADDLE_THROW( - "Getting 'sub_tensor_number' is not supported by the type of var %s.", - this->Name()); + platform::errors::Unavailable("Getting 'sub_tensor_number' is not " + "supported by the %s type variable.", + this->Name())); } } @@ -133,9 +135,9 @@ void VarDesc::SetLoDLevel(int32_t lod_level) { desc_.mutable_type()->mutable_tensor_array()->set_lod_level(lod_level); break; default: - PADDLE_THROW( - "Setting 'lod_level' is not supported by the type of var %s.", - this->Name()); + PADDLE_THROW(platform::errors::Unavailable( + "Setting 'lod_level' is not supported by the %s type variable.", + this->Name())); } } @@ -157,9 +159,9 @@ void VarDesc::SetLoDLevels(const std::vector &multiple_lod_level) { } } break; default: - PADDLE_THROW( - "Setting 'lod_levels' is not supported by the type of var %s.", - this->Name()); + PADDLE_THROW(platform::errors::Unavailable( + "Setting 'lod_levels' is not supported by the %s type variable.", + this->Name())); } } @@ -170,9 +172,9 @@ int32_t VarDesc::GetLoDLevel() const { case proto::VarType::LOD_TENSOR_ARRAY: return desc_.type().tensor_array().lod_level(); default: - PADDLE_THROW( - "Getting 'lod_level' is not supported by the type of var %s.", - this->Name()); + PADDLE_THROW(platform::errors::Unavailable( + "Getting 'lod_level' is not supported by the %s type variable.", + this->Name())); } } @@ -187,15 +189,19 @@ std::vector VarDesc::GetLoDLevels() const { return res; break; default: - PADDLE_THROW( - "Getting 'lod_levels' is not supported by the type of var %s.", - this->Name()); + PADDLE_THROW(platform::errors::Unavailable( + "Getting 'lod_levels' is not supported by the %s type variable.", + this->Name())); } } const proto::VarType::TensorDesc &VarDesc::tensor_desc() const { - PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set."); - PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set."); + PADDLE_ENFORCE_EQ( + desc_.has_type(), true, + platform::errors::NotFound("The variable's type has not been set.")); + PADDLE_ENFORCE_EQ( + desc_.type().has_type(), true, + platform::errors::NotFound("The variable's type has not been set.")); switch (desc_.type().type()) { case proto::VarType::SELECTED_ROWS: return desc_.type().selected_rows(); @@ -204,14 +210,16 @@ const proto::VarType::TensorDesc &VarDesc::tensor_desc() const { case proto::VarType::LOD_TENSOR_ARRAY: return desc_.type().tensor_array().tensor(); default: - PADDLE_THROW( - "Getting 'tensor_desc' is not supported by the type of var %s.", - this->Name()); + PADDLE_THROW(platform::errors::Unavailable( + "Getting 'tensor_desc' is not supported by the %s type variable.", + this->Name())); } } std::vector
VarDesc::tensor_descs() const { - PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); + PADDLE_ENFORCE_EQ( + desc_.has_type(), true, + platform::errors::NotFound("The variable's type has not been set.")); std::vector res; res.reserve(GetTensorDescNum()); switch (desc_.type().type()) { @@ -221,16 +229,19 @@ std::vector VarDesc::tensor_descs() const { } return res; default: - PADDLE_THROW( - "Getting 'tensor_descs' is not supported by the type of var " - "%s.", - this->Name()); + PADDLE_THROW(platform::errors::Unavailable( + "Getting 'tensor_descs' is not supported by the %s type variable.", + this->Name())); } } proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() { - PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); - PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set."); + PADDLE_ENFORCE_EQ( + desc_.has_type(), true, + platform::errors::NotFound("The variable's type has not been set.")); + PADDLE_ENFORCE_EQ( + desc_.type().has_type(), true, + platform::errors::NotFound("The variable's type has not been set.")); switch (desc_.type().type()) { case proto::VarType::SELECTED_ROWS: return desc_.mutable_type()->mutable_selected_rows(); @@ -240,15 +251,19 @@ proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() { return desc_.mutable_type()->mutable_tensor_array()->mutable_tensor(); default: PADDLE_THROW( - "Getting 'mutable_tensor_desc' is not supported by the type of var " - "%s.", - this->Name()); + platform::errors::Unavailable("Getting 'mutable_tensor_desc' is not " + "supported by the %s type variable.", + this->Name())); } } std::vector VarDesc::mutable_tensor_descs() { - PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); - PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set."); + PADDLE_ENFORCE_EQ( + desc_.has_type(), true, + platform::errors::NotFound("The variable's type has not been set.")); + PADDLE_ENFORCE_EQ( + desc_.type().has_type(), true, + platform::errors::NotFound("The variable's type has not been set.")); std::vector res; res.reserve(GetTensorDescNum()); switch (desc_.type().type()) { @@ -259,10 +274,9 @@ std::vector VarDesc::mutable_tensor_descs() { } return res; default: - PADDLE_THROW( - "Getting 'tensor_descs' is not supported by the type of var " - "%s.", - this->Name()); + PADDLE_THROW(platform::errors::Unavailable( + "Getting 'tensor_descs' is not supported by the %s type variable.", + this->Name())); } } diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index 43e9ed553bea84aaaaa18a46fe81f06a18b124af..8affeda67b3d07d67ceed2b657b285210e1bd076 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -40,7 +40,8 @@ inline proto::VarType::Type ToVarType(int type) { case proto::VarType::READER: return static_cast(type); default: - PADDLE_THROW("ToVarType:Unsupported type %d", type); + PADDLE_THROW(platform::errors::Unavailable( + "Unsupported type %d in ToVarType method.", type)); } } @@ -66,7 +67,8 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) { visitor(var.Get()); return; default: - PADDLE_THROW("Not supported visit type, %s", ToTypeName(var.Type())); + PADDLE_THROW(platform::errors::Unavailable("Unsupported visit type %s.", + ToTypeName(var.Type()))); } } diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index 5c90b07149ec5575f9907e41cc65a826421cf3ec..1e5e8d657556059bae8129e7c0b6ea6b57cbf63f 100644 ---
a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -46,12 +46,14 @@ struct VarIdToTypeIndexMapInitializerImpl { static_assert(!std::is_same::value, "Type cannot be void"); constexpr int kId = VarTypeTrait::kId; auto type = std::type_index(typeid(Type)); - PADDLE_ENFORCE(id_to_type->count(kId) == 0, - "Registered duplicate type id %d for type %s", kId, - type.name()); - PADDLE_ENFORCE(type_to_id->count(type) == 0, - "Registered duplicate type_index %s for id %d", type.name(), - kId); + PADDLE_ENFORCE_EQ( + id_to_type->count(kId), 0, + platform::errors::AlreadyExists( + "Registered duplicate type id %d for type %s.", kId, type.name())); + PADDLE_ENFORCE_EQ( + type_to_id->count(type), 0, + platform::errors::AlreadyExists( + "Registered duplicate type index %s for id %d.", type.name(), kId)); id_to_type->emplace(kId, type); type_to_id->emplace(type, kId); VarIdToTypeIndexMapInitializerImplsecond; } static int ToTypeId(const std::type_index &type) { auto it = Instance().type_to_id_map_.find(type); - PADDLE_ENFORCE(it != Instance().type_to_id_map_.end(), - "VarType %s is not registered.", type.name()); + PADDLE_ENFORCE_NE(it, Instance().type_to_id_map_.end(), + platform::errors::NotFound( + "Variable Type %s is not registered.", type.name())); return it->second; } diff --git a/paddle/fluid/framework/variable_helper.cc b/paddle/fluid/framework/variable_helper.cc index 67e17410a29aff435921f46eeb2691a025d5a9eb..ec42aa30e5abb3dc3d03633cae31d95999d82731 100644 --- a/paddle/fluid/framework/variable_helper.cc +++ b/paddle/fluid/framework/variable_helper.cc @@ -50,11 +50,11 @@ void InitializeVariable(Variable *var, proto::VarType::Type var_type) { } else if (var_type == proto::VarType::RAW) { // GetMutable will be called in operator } else { - PADDLE_THROW( + PADDLE_THROW(platform::errors::Unavailable( "Variable type %d is not in " "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " - "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]", - var_type); + "LOD_RANK_TABLE, PLACE_LIST, READER, RAW].", + var_type)); } } @@ -76,7 +76,8 @@ void CopyVariable(const Variable &src_var, Variable *dst_var) { auto *dst_t = tmp_grad_slr->mutable_value(); framework::TensorCopy(src_t, cpu_place, dst_t); } else { - PADDLE_THROW("unknown var type to copy"); + PADDLE_THROW( + platform::errors::Unavailable("Unknown variable type to copy.")); } } diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 27bae7a71ea192ac08e4e87cb7bcdb8b84e29dc8..8d28b8ace26ae51b8fb6b3dcb240c08b1686b143 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -218,6 +218,10 @@ struct Argument { DECL_ARGUMENT_FIELD(fusion_statis, FusionStatis, fusion_statis_t); + // Only used in paddle-lite subgraph. 
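+ // Judging from how this field is used later in this patch
+ // (argument->cpu_math_library_num_threads() in ir_pass_manager.cc and
+ // argument_.SetCpuMathLibraryNumThreads(...) in analysis_predictor.cc),
+ // DECL_ARGUMENT_FIELD is expected to expand to roughly the following
+ // accessors; a hedged sketch, not the verbatim macro expansion:
+ //   int& cpu_math_library_num_threads();      // getter
+ //   void SetCpuMathLibraryNumThreads(int x);  // setter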
+ DECL_ARGUMENT_FIELD(cpu_math_library_num_threads, CpuMathLibraryNumThreads, + int); + private: std::unordered_set valid_fields_; }; diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index cd8d86d72938417112e17e86e5cc6dd12254a8d1..d52d71f148c36fa456aaa703c0df2dbccd901205 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -150,6 +150,8 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("use_xpu", new bool(argument->use_xpu())); pass->Set("xpu_l3_workspace_size", new int(argument->xpu_l3_workspace_size())); + pass->Set("cpu_math_library_num_threads", + new int(argument->cpu_math_library_num_threads())); } disable_logs_ = argument->disable_logs(); if (pass_name == "fc_fuse_pass") { diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc index 6b16a481ddedbad0956d1358de95842ea9a3a101..e78d5ef017b7f8451556d388bf3b8c0a55276a59 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc @@ -244,6 +244,7 @@ void LiteSubgraphPass::SetUpEngine( bool enable_int8 = Get("enable_int8"); bool use_xpu = Get("use_xpu"); int xpu_l3_workspace_size = Get("xpu_l3_workspace_size"); + int cpu_math_library_num_threads = Get("cpu_math_library_num_threads"); lite_api::TargetType target_type; if (use_gpu) { @@ -263,11 +264,12 @@ void LiteSubgraphPass::SetUpEngine( // Notice: The ordering here determines the device where the // input tensor of the Lite engine is located, and then affects // whether tensor sharing is feasible. - paddle::lite::Place({target_type, precision_type}), - paddle::lite::Place({target_type, PRECISION(kInt64)}), - paddle::lite::Place({target_type, PRECISION(kFloat)}), - paddle::lite::Place({TARGET(kHost), PRECISION(kFloat)}), + paddle::lite_api::Place({target_type, precision_type}), + paddle::lite_api::Place({target_type, PRECISION(kInt64)}), + paddle::lite_api::Place({target_type, PRECISION(kFloat)}), + paddle::lite_api::Place({TARGET(kHost), PRECISION(kFloat)}), }; + config.cpu_math_library_num_threads = cpu_math_library_num_threads; config.xpu_l3_workspace_size = xpu_l3_workspace_size; if (dump_model) { lite::StrToBinaryFile("./model.bin", config.model); diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index fb0ad31a3e612201de32813a65970c73b73b611b..c0d3b14e0e43e10332d18ddd217a8a50245ab5ed 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -53,12 +53,10 @@ if(WITH_TESTING) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS paddle_fluid_shared ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book) set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) - set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST") elseif(WIN32) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps} ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book) set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) - set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST") endif() endif() diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 
500aa8341d6a61056f6f80f82c6f28bb569eb772..64dfdda54aceefef1d89ccb2e3a917ad47c53966 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -461,6 +461,8 @@ void AnalysisPredictor::PrepareArgument() { } if (config_.lite_engine_enabled()) { + argument_.SetCpuMathLibraryNumThreads( + config_.cpu_math_library_num_threads()); argument_.SetLitePrecisionMode(config_.lite_precision_mode_); argument_.SetLitePassesFilter(config_.lite_passes_filter_); argument_.SetLiteOpsFilter(config_.lite_ops_filter_); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index a5a0405ac88ad8e94a65d728557ab9298eae56dc..46755eeda660ae8f4c54d318f6450fbf1d48b1f7 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -21,15 +21,21 @@ namespace paddle { void ZeroCopyTensor::Reshape(const std::vector &shape) { - PADDLE_ENFORCE(!name_.empty(), - "Need to SetName first, so that the corresponding tensor can " - "be retrieved."); - PADDLE_ENFORCE(input_or_output_, - "Can't reshape the output tensor, it is readonly"); - PADDLE_ENFORCE(scope_); + PADDLE_ENFORCE_EQ( + name_.empty(), false, + platform::errors::PreconditionNotMet( + "Need to SetName first, so that the corresponding tensor can " + "be retrieved.")); + PADDLE_ENFORCE_EQ(input_or_output_, true, + platform::errors::PermissionDenied( + "Can't reshape the output tensor, it is read-only")); + PADDLE_ENFORCE_NOT_NULL(scope_, platform::errors::PreconditionNotMet( + "The scope should not be nullptr.")); auto *scope = static_cast(scope_); auto *var = scope->FindVar(name_); - PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::PreconditionNotMet( + "No tensor called [%s] in the runtime scope", name_)); auto *tensor = var->GetMutable(); tensor->Resize(framework::make_ddim(shape)); } @@ -45,8 +51,10 @@ T *ZeroCopyTensor::mutable_data(PaddlePlace place) { EAGER_GET_TENSOR; PADDLE_ENFORCE_GT( tensor->numel(), 0, - "You should call ZeroCopyTensor::Reshape(const std::vector &shape)" - "function before retrieving mutable_data from input tensor."); + platform::errors::PreconditionNotMet( + "You should call ZeroCopyTensor::Reshape(const std::vector " + "&shape) " + "function before retrieving mutable_data from input tensor.")); switch (static_cast(place)) { case static_cast(PaddlePlace::kCPU): { return tensor->mutable_data(platform::CPUPlace()); @@ -55,7 +63,8 @@ T *ZeroCopyTensor::mutable_data(PaddlePlace place) { return tensor->mutable_data(platform::CUDAPlace(device_)); } default: - PADDLE_THROW("Unsupported place: %d", static_cast(place)); + PADDLE_THROW(platform::errors::Unavailable("Unsupported place: %d", + static_cast(place))); break; } return nullptr; @@ -96,10 +105,11 @@ PaddleDType ZeroCopyTensor::type() const { template void ZeroCopyTensor::copy_from_cpu(const T *data) { EAGER_GET_TENSOR; - PADDLE_ENFORCE_GE( - tensor->numel(), 0, - "You should call ZeroCopyTensor::Reshape(const std::vector &shape)" - "function before copying data from cpu."); + PADDLE_ENFORCE_GE(tensor->numel(), 0, + platform::errors::PreconditionNotMet( + "You should call ZeroCopyTensor::Reshape(const " + "std::vector &shape) " + "function before copying data from cpu.")); size_t ele_size = tensor->numel() * sizeof(T); if (place_ == PaddlePlace::kCPU) { @@ -116,7 +126,8 @@ void ZeroCopyTensor::copy_from_cpu(const T *data) {
memory::Copy(gpu_place, static_cast(t_data), platform::CPUPlace(), data, ele_size, dev_ctx->stream()); #else - PADDLE_THROW("Not compiled with CUDA, should not reach here."); + PADDLE_THROW(platform::errors::Unavailable( + "Not compiled with CUDA, should not reach here.")); #endif } } @@ -141,7 +152,8 @@ void ZeroCopyTensor::copy_to_cpu(T *data) { cudaStreamSynchronize(dev_ctx->stream()); #else - PADDLE_THROW("Not compile with CUDA, should not reach here."); + PADDLE_THROW(platform::errors::Unavailable( + "Not compiled with CUDA, should not reach here.")); #endif } } @@ -176,20 +188,27 @@ template PD_INFER_DECL uint8_t *ZeroCopyTensor::mutable_data( PaddlePlace place); void *ZeroCopyTensor::FindTensor() const { - PADDLE_ENFORCE(!name_.empty(), - "Need to SetName first, so that the corresponding tensor can " - "be retrieved."); - PADDLE_ENFORCE(scope_); + PADDLE_ENFORCE_EQ( + name_.empty(), false, + platform::errors::PreconditionNotMet( + "Need to SetName first, so that the corresponding tensor can " + "be retrieved.")); + PADDLE_ENFORCE_NOT_NULL(scope_, platform::errors::PreconditionNotMet( + "The scope should not be nullptr.")); auto *scope = static_cast(scope_); auto *var = scope->FindVar(name_); - PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::PreconditionNotMet( + "No tensor called [%s] in the runtime scope", name_)); auto *tensor = var->GetMutable(); return tensor; } std::vector ZeroCopyTensor::shape() const { EAGER_GET_TENSOR; - PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_); + PADDLE_ENFORCE_NOT_NULL( + tensor_, platform::errors::PreconditionNotMet( + "No tensor called %s found in the scope", name_)); return framework::vectorize(tensor->dims()); } diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index a58b510ecf16a4bb2e2be9f4c2946a550ea20d2d..5dc4430fde4715fe11c19ce8adc7397f77391fc3 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -31,12 +31,30 @@ limitations under the License. */ #include "paddle_analysis_config.h" // NOLINT #include "paddle_api.h" // NOLINT +/// +/// \file paddle_inference_api.h +/// +/// \brief Paddle Inference API +/// +/// \author paddle-infer@baidu.com +/// \date 2020-09-01 +/// \since 2.0.0-beta +/// + namespace paddle_infer { using DataType = paddle::PaddleDType; using PlaceType = paddle::PaddlePlace; using PrecisionType = paddle::AnalysisConfig::Precision; using Config = paddle::AnalysisConfig; +/// +/// \class Tensor
+///
+/// \brief Represents an n-dimensional array of values.
+/// The Tensor is used to store the input or output of the network.
+/// It is obtained through the Predictor::GetInputHandle()
+/// and Predictor::GetOutputHandle() interfaces.
+/// class PD_INFER_DECL Tensor { public: // Can only be created by predictor->GetInputHandle(cosnt std::string& name) Tensor() = delete; explicit Tensor(std::unique_ptr&& tensor) : tensor_(std::move(tensor)) {} + + /// + /// \brief Reset the shape of the tensor. + /// Generally it's only used for the input tensor. + /// Reshape must be called before calling mutable_data() or CopyFromCpu(). + /// \param shape The shape to set. + /// void Reshape(const std::vector& shape); + /// + /// \brief Copy the host memory to tensor data. + /// It's usually used to set the input tensor data.
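+ /// A minimal usage sketch (names are illustrative; assumes input_t was
+ /// obtained from Predictor::GetInputHandle() and Reshape() sets the
+ /// shape before the copy):
+ /// \code{cpp}
+ /// std::vector<float> host_data(1 * 3 * 224 * 224, 0.f);
+ /// input_t->Reshape({1, 3, 224, 224});
+ /// input_t->CopyFromCpu(host_data.data());
+ /// \endcode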
+ /// \param data The pointer of the data, from which the tensor will copy. + /// template void CopyFromCpu(const T* data); - // should add the place + /// + /// \brief Get the memory pointer in CPU or GPU with specific data type. + /// Please Reshape the tensor first before calling this. + /// It's usually used to get input data pointer. + /// \param place The place of the tensor. + /// \return The tensor data buffer pointer. + /// template T* mutable_data(PlaceType place); + /// + /// \brief Copy the tensor data to the host memory. + /// It's usually used to get the output tensor data. + /// \param[out] data The address to which the tensor will copy its data. + /// template void CopyToCpu(T* data); + /// + /// \brief Get the memory pointer directly. + /// It's usually used to get the output data pointer. + /// \param[out] place To get the device type of the tensor. + /// \param[out] size To get the data size of the tensor. + /// \return The tensor data buffer pointer. + /// template T* data(PlaceType* place, int* size) const; + /// + /// \brief Set lod info of the tensor. + /// More about LOD can be seen here: + /// https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/basic_concept/lod_tensor.html#lodtensor + /// \param x the lod info. + /// void SetLoD(const std::vector>& x); + + /// \brief Return the lod info of the tensor. std::vector> lod() const; + /// \brief Return the data type of the tensor. + /// It's usually used to get the output tensor data type. + /// \return The data type of the tensor. DataType type() const; + /// \brief Return the shape of the Tensor. std::vector shape() const; + + /// \brief Return the name of the tensor. const std::string& name() const; private: std::unique_ptr tensor_; }; +/// +/// \class Predictor +/// +/// \brief Predictor is the interface for model prediction. +/// +/// The predictor has the following typical uses: +/// +/// Get predictor +/// \code{cpp} +/// auto predictor = CreatePredictor(config); +/// \endcode +/// +/// Get input or output names +/// \code{cpp} +/// auto input_names = predictor->GetInputNames(); +/// auto output_names = predictor->GetOutputNames(); +/// \endcode +/// +/// Get input or output handle +/// \code{cpp} +/// auto input_t = predictor->GetInputHandle(input_names[0]); +/// auto output_t = predictor->GetOutputHandle(output_names[0]); +/// \endcode +/// +/// Run predictor +/// \code{cpp} +/// predictor->Run(); +/// \endcode +/// class PD_INFER_DECL Predictor { public: Predictor() = delete; @@ -79,25 +170,78 @@ class PD_INFER_DECL Predictor { explicit Predictor(std::unique_ptr&& pred) : predictor_(std::move(pred)) {} + /// + /// \brief Construct a new Predictor object + /// + /// \param[in] config the configuration + /// explicit Predictor(const Config& config); + /// + /// \brief Get the input names + /// + /// \return input names + /// std::vector GetInputNames(); + + /// + /// \brief Get the Input Tensor object + /// + /// \param[in] name input name + /// \return input tensor + /// std::unique_ptr GetInputHandle(const std::string& name); + /// + /// \brief Run the prediction engine + /// + /// \return Whether the function executed successfully + /// bool Run(); + /// + /// \brief Get the output names + /// + /// \return output names + /// std::vector GetOutputNames(); + + /// + /// \brief Get the Output Tensor object + /// + /// \param[in] name output name + /// \return output tensor + /// std::unique_ptr GetOutputHandle(const std::string& name); + /// + /// \brief Clone to get a new predictor. Thread safe.
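+ /// A hedged sketch of the intended per-thread use (the worker body is
+ /// illustrative only):
+ /// \code{cpp}
+ /// auto thread_predictor = predictor->Clone();
+ /// std::thread worker([&] { thread_predictor->Run(); });
+ /// worker.join();
+ /// \endcode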
+ /// + /// \return get a new predictor + /// std::unique_ptr Clone(); + + /// \brief Clear the intermediate tensors of the predictor void ClearIntermediateTensor(); private: std::unique_ptr predictor_; }; +/// +/// \brief A factory to help create predictors. +/// +/// Usage: +/// +/// \code{.cpp} +/// Config config; +/// ... // change the configs. +/// auto predictor = CreatePredictor(config); +/// \endcode +/// PD_INFER_DECL std::shared_ptr CreatePredictor( const Config& config); // NOLINT + PD_INFER_DECL int GetNumBytesOfDataType(DataType dtype); PD_INFER_DECL std::string GetVersion(); @@ -128,13 +272,24 @@ T* Tensor::data(PlaceType* place, int* size) const { namespace paddle_infer { namespace services { +/// +/// \class PredictorPool +/// +/// \brief PredictorPool is a simple encapsulation of Predictor, suitable for +/// use in multi-threaded situations. According to the thread id, the +/// corresponding Predictor is taken out from PredictorPool to complete the +/// prediction. +/// class PD_INFER_DECL PredictorPool { public: PredictorPool() = delete; PredictorPool(const PredictorPool&) = delete; PredictorPool& operator=(const PredictorPool&) = delete; + /// \brief Construct the predictor pool with \param size predictor instances. explicit PredictorPool(const Config& config, size_t size = 1); + + /// \brief Get \param id-th predictor. Predictor* Retrive(size_t idx); private: diff --git a/paddle/fluid/inference/capi/c_api.cc b/paddle/fluid/inference/capi/c_api.cc index 821dff2f036c1892570a8ade5b40363251c7f531..07493c742c4fa906e7c4817e328e7d4f81afbffa 100644 --- a/paddle/fluid/inference/capi/c_api.cc +++ b/paddle/fluid/inference/capi/c_api.cc @@ -16,6 +16,7 @@ #include #include "paddle/fluid/inference/capi/c_api_internal.h" #include "paddle/fluid/inference/capi/paddle_c_api.h" +#include "paddle/fluid/platform/enforce.h" using paddle::ConvertToACPrecision; using paddle::ConvertToPaddleDType; @@ -34,27 +35,37 @@ void PD_DeletePaddleBuf(PD_PaddleBuf* buf) { } void PD_PaddleBufResize(PD_PaddleBuf* buf, size_t length) { - PADDLE_ENFORCE_NOT_NULL(buf); + PADDLE_ENFORCE_NOT_NULL(buf, + paddle::platform::errors::InvalidArgument( + "The pointer of Buffer shouldn't be nullptr")); buf->buf.Resize(length); } void PD_PaddleBufReset(PD_PaddleBuf* buf, void* data, size_t length) { - PADDLE_ENFORCE_NOT_NULL(buf); + PADDLE_ENFORCE_NOT_NULL(buf, + paddle::platform::errors::InvalidArgument( + "The pointer of Buffer shouldn't be nullptr")); buf->buf.Reset(data, length); } bool PD_PaddleBufEmpty(PD_PaddleBuf* buf) { - PADDLE_ENFORCE_NOT_NULL(buf); + PADDLE_ENFORCE_NOT_NULL(buf, + paddle::platform::errors::InvalidArgument( + "The pointer of Buffer shouldn't be nullptr")); return buf->buf.empty(); } void* PD_PaddleBufData(PD_PaddleBuf* buf) { - PADDLE_ENFORCE_NOT_NULL(buf); + PADDLE_ENFORCE_NOT_NULL(buf, + paddle::platform::errors::InvalidArgument( + "The pointer of Buffer shouldn't be nullptr")); return buf->buf.data(); } size_t PD_PaddleBufLength(PD_PaddleBuf* buf) { - PADDLE_ENFORCE_NOT_NULL(buf); + PADDLE_ENFORCE_NOT_NULL(buf, + paddle::platform::errors::InvalidArgument( + "The pointer of Buffer shouldn't be nullptr")); return buf->buf.length(); } diff --git a/paddle/fluid/inference/capi/c_api_internal.h b/paddle/fluid/inference/capi/c_api_internal.h index 2dd827229779d34384df2b3ba5f398c77db8369a..7e69b7210768e5af9e8f4150883a608a1517a13c 100644 --- a/paddle/fluid/inference/capi/c_api_internal.h +++ b/paddle/fluid/inference/capi/c_api_internal.h @@ -18,7 +18,6 @@ #include 
"paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/api/paddle_api.h" #include "paddle/fluid/inference/capi/paddle_c_api.h" -#include "paddle/fluid/platform/enforce.h" using PD_PaddleDType = paddle::PaddleDType; using PD_ACPrecision = paddle::AnalysisConfig::Precision; diff --git a/paddle/fluid/inference/capi/pd_config.cc b/paddle/fluid/inference/capi/pd_config.cc index b99abc06b27ecb9686b4c6e883aaaf8b3e592415..af8d4a69ecf24862ca5f282655b72ef37307c1c8 100644 --- a/paddle/fluid/inference/capi/pd_config.cc +++ b/paddle/fluid/inference/capi/pd_config.cc @@ -20,6 +20,7 @@ #include #include "paddle/fluid/inference/capi/c_api_internal.h" #include "paddle/fluid/inference/capi/paddle_c_api.h" +#include "paddle/fluid/platform/enforce.h" using paddle::ConvertToACPrecision; using paddle::ConvertToPaddleDType; @@ -40,7 +41,10 @@ void PD_DeleteAnalysisConfig(PD_AnalysisConfig* config) { void PD_SetModel(PD_AnalysisConfig* config, const char* model_dir, const char* params_path) { LOG(INFO) << model_dir; - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); LOG(INFO) << std::string(model_dir); if (!params_path) { config->config.SetModel(std::string(model_dir)); @@ -50,104 +54,164 @@ void PD_SetModel(PD_AnalysisConfig* config, const char* model_dir, } void PD_SetProgFile(PD_AnalysisConfig* config, const char* x) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SetProgFile(std::string(x)); } void PD_SetParamsFile(PD_AnalysisConfig* config, const char* x) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SetParamsFile(std::string(x)); } void PD_SetOptimCacheDir(PD_AnalysisConfig* config, const char* opt_cache_dir) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SetOptimCacheDir(std::string(opt_cache_dir)); } const char* PD_ModelDir(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.model_dir().c_str(); } const char* PD_ProgFile(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.prog_file().c_str(); } const char* PD_ParamsFile(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.params_file().c_str(); } void PD_EnableUseGpu(PD_AnalysisConfig* config, int memory_pool_init_size_mb, int device_id) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.EnableUseGpu(static_cast(memory_pool_init_size_mb), 
device_id); } void PD_DisableGpu(PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.DisableGpu(); } bool PD_UseGpu(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.use_gpu(); } int PD_GpuDeviceId(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.gpu_device_id(); } int PD_MemoryPoolInitSizeMb(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.memory_pool_init_size_mb(); } float PD_FractionOfGpuMemoryForPool(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.fraction_of_gpu_memory_for_pool(); } void PD_EnableCUDNN(PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.EnableCUDNN(); } bool PD_CudnnEnabled(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.cudnn_enabled(); } void PD_SwitchIrOptim(PD_AnalysisConfig* config, bool x) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SwitchIrOptim(x); } bool PD_IrOptim(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.ir_optim(); } void PD_SwitchUseFeedFetchOps(PD_AnalysisConfig* config, bool x) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SwitchUseFeedFetchOps(x); } bool PD_UseFeedFetchOpsEnabled(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.use_feed_fetch_ops_enabled(); } void PD_SwitchSpecifyInputNames(PD_AnalysisConfig* config, bool x) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SwitchSpecifyInputNames(x); } bool PD_SpecifyInputName(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + 
paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.specify_input_name(); } @@ -155,110 +219,168 @@ void PD_EnableTensorRtEngine(PD_AnalysisConfig* config, int workspace_size, int max_batch_size, int min_subgraph_size, Precision precision, bool use_static, bool use_calib_mode) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.EnableTensorRtEngine( workspace_size, max_batch_size, min_subgraph_size, paddle::ConvertToACPrecision(precision), use_static, use_calib_mode); } bool PD_TensorrtEngineEnabled(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.tensorrt_engine_enabled(); } void PD_SwitchIrDebug(PD_AnalysisConfig* config, bool x) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SwitchIrDebug(x); } void PD_EnableMKLDNN(PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.EnableMKLDNN(); } void PD_SetMkldnnCacheCapacity(PD_AnalysisConfig* config, int capacity) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SetMkldnnCacheCapacity(capacity); } bool PD_MkldnnEnabled(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.mkldnn_enabled(); } void PD_SetCpuMathLibraryNumThreads(PD_AnalysisConfig* config, int cpu_math_library_num_threads) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SetCpuMathLibraryNumThreads(cpu_math_library_num_threads); } int PD_CpuMathLibraryNumThreads(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.cpu_math_library_num_threads(); } void PD_EnableMkldnnQuantizer(PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.EnableMkldnnQuantizer(); } bool PD_MkldnnQuantizerEnabled(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.mkldnn_quantizer_enabled(); } void PD_EnableMkldnnBfloat16(PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound( - "PD_AnalysisConfig should not be null")); 
+ PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.EnableMkldnnBfloat16(); } bool PD_MkldnnBfloat16Enabled(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound( - "PD_AnalysisConfig should not be null")); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.mkldnn_bfloat16_enabled(); } void PD_SetModelBuffer(PD_AnalysisConfig* config, const char* prog_buffer, size_t prog_buffer_size, const char* params_buffer, size_t params_buffer_size) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SetModelBuffer(prog_buffer, prog_buffer_size, params_buffer, params_buffer_size); } bool PD_ModelFromMemory(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.model_from_memory(); } void PD_EnableMemoryOptim(PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.EnableMemoryOptim(); } bool PD_MemoryOptimEnabled(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.enable_memory_optim(); } void PD_EnableProfile(PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.EnableProfile(); } bool PD_ProfileEnabled(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.profile_enabled(); } void PD_SetInValid(PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SetInValid(); } bool PD_IsValid(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.is_valid(); } diff --git a/paddle/fluid/inference/capi/pd_predictor.cc b/paddle/fluid/inference/capi/pd_predictor.cc index 8aa1e2a7b7f9b99a1636ca2e7396089ab2ae7e15..0509a6190211c25b6461c1d683daa6b33110b4e0 100644 --- a/paddle/fluid/inference/capi/pd_predictor.cc +++ b/paddle/fluid/inference/capi/pd_predictor.cc @@ -22,6 +22,7 @@ #include "paddle/fluid/inference/api/paddle_api.h" #include "paddle/fluid/inference/capi/c_api_internal.h" #include "paddle/fluid/inference/capi/paddle_c_api.h" +#include "paddle/fluid/platform/enforce.h" using paddle::ConvertToACPrecision; using paddle::ConvertToPaddleDType; @@ -81,7 +82,10 
@@ extern "C" { bool PD_PredictorRun(const PD_AnalysisConfig* config, PD_Tensor* inputs, int in_size, PD_Tensor** output_data, int* out_size, int batch_size) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); VLOG(3) << "Predoctor: PD_PredictorRun. "; static std::map> predictors; @@ -111,7 +115,10 @@ bool PD_PredictorRun(const PD_AnalysisConfig* config, PD_Tensor* inputs, bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config, PD_ZeroCopyData* inputs, int in_size, PD_ZeroCopyData** output, int* out_size) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); static std::map> predictors; if (!predictors.count(config->config.model_dir())) { @@ -144,7 +151,8 @@ bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config, input_t->copy_from_cpu(static_cast(inputs[i].data)); break; default: - CHECK(false) << "Unsupport data type."; + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unsupported data type.")); break; } } @@ -227,7 +235,8 @@ void PD_SetZeroCopyInput(PD_Predictor* predictor, input->copy_from_cpu(static_cast(tensor->data.data)); break; default: - CHECK(false) << "Unsupport data type."; + PADDLE_THROW( + paddle::platform::errors::InvalidArgument("Unsupported data type.")); break; } @@ -294,7 +303,8 @@ void PD_GetZeroCopyOutput(PD_Predictor* predictor, PD_ZeroCopyTensor* tensor) { output->copy_to_cpu(reinterpret_cast(tensor->data.data)); break; default: - CHECK(false) << "Unsupport data type."; + PADDLE_THROW( + paddle::platform::errors::InvalidArgument("Unsupported data type.")); break; } } diff --git a/paddle/fluid/inference/capi/pd_tensor.cc b/paddle/fluid/inference/capi/pd_tensor.cc index b4811f1d6ff192659fa12b33008fe5ac07e6a6c5..9b1eedd7c5a8106a6f6b7be3f682913e2431a3e5 100644 --- a/paddle/fluid/inference/capi/pd_tensor.cc +++ b/paddle/fluid/inference/capi/pd_tensor.cc @@ -19,6 +19,7 @@ #include #include "paddle/fluid/inference/capi/c_api_internal.h" #include "paddle/fluid/inference/capi/paddle_c_api.h" +#include "paddle/fluid/platform/enforce.h" using paddle::ConvertToACPrecision; using paddle::ConvertToPaddleDType; @@ -37,44 +38,60 @@ void PD_DeletePaddleTensor(PD_Tensor* tensor) { } void PD_SetPaddleTensorName(PD_Tensor* tensor, char* name) { - PADDLE_ENFORCE_NOT_NULL(tensor); + PADDLE_ENFORCE_NOT_NULL(tensor, + paddle::platform::errors::InvalidArgument( + "The pointer of tensor shouldn't be nullptr")); tensor->tensor.name = std::string(name); } void PD_SetPaddleTensorDType(PD_Tensor* tensor, PD_DataType dtype) { - PADDLE_ENFORCE_NOT_NULL(tensor); + PADDLE_ENFORCE_NOT_NULL(tensor, + paddle::platform::errors::InvalidArgument( + "The pointer of tensor shouldn't be nullptr")); tensor->tensor.dtype = paddle::ConvertToPaddleDType(dtype); } void PD_SetPaddleTensorData(PD_Tensor* tensor, PD_PaddleBuf* buf) { - PADDLE_ENFORCE_NOT_NULL(tensor); + PADDLE_ENFORCE_NOT_NULL(tensor, + paddle::platform::errors::InvalidArgument( + "The pointer of tensor shouldn't be nullptr")); tensor->tensor.data = buf->buf; } void PD_SetPaddleTensorShape(PD_Tensor* tensor, int* shape, int size) { - PADDLE_ENFORCE_NOT_NULL(tensor); + PADDLE_ENFORCE_NOT_NULL(tensor, + paddle::platform::errors::InvalidArgument( + "The pointer of tensor shouldn't be nullptr")); tensor->tensor.shape.assign(shape, shape + size); } const char* 
PD_GetPaddleTensorName(const PD_Tensor* tensor) { - PADDLE_ENFORCE_NOT_NULL(tensor); + PADDLE_ENFORCE_NOT_NULL(tensor, + paddle::platform::errors::InvalidArgument( + "The pointer of tensor shouldn't be nullptr")); return tensor->tensor.name.c_str(); } PD_DataType PD_GetPaddleTensorDType(const PD_Tensor* tensor) { - PADDLE_ENFORCE_NOT_NULL(tensor); + PADDLE_ENFORCE_NOT_NULL(tensor, + paddle::platform::errors::InvalidArgument( + "The pointer of tensor shouldn't be nullptr")); return ConvertToPDDataType(tensor->tensor.dtype); } PD_PaddleBuf* PD_GetPaddleTensorData(const PD_Tensor* tensor) { - PADDLE_ENFORCE_NOT_NULL(tensor); + PADDLE_ENFORCE_NOT_NULL(tensor, + paddle::platform::errors::InvalidArgument( + "The pointer of tensor shouldn't be nullptr")); PD_PaddleBuf* ret = PD_NewPaddleBuf(); ret->buf = tensor->tensor.data; return ret; } const int* PD_GetPaddleTensorShape(const PD_Tensor* tensor, int* size) { - PADDLE_ENFORCE_NOT_NULL(tensor); + PADDLE_ENFORCE_NOT_NULL(tensor, + paddle::platform::errors::InvalidArgument( + "The pointer of tensor shouldn't be nullptr")); const std::vector& shape = tensor->tensor.shape; *size = shape.size(); return shape.data(); diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc index 8e88c94493952ff257ef69bf73f8edebb6ba2eee..5f24ef00bce59e5886d8448cf3f8356e9aeba481 100644 --- a/paddle/fluid/inference/lite/engine.cc +++ b/paddle/fluid/inference/lite/engine.cc @@ -20,8 +20,12 @@ #define LITE_WITH_XPU 1 #endif +#ifndef PADDLE_WITH_ARM +#define LITE_WITH_X86 1 +#endif + #include "paddle/fluid/inference/lite/engine.h" -#include "lite/api/paddle_use_passes.h" +#include namespace paddle { namespace inference { @@ -36,32 +40,40 @@ bool EngineManager::Has(const std::string& name) const { return engines_.at(name).get() != nullptr; } -paddle::lite::Predictor* EngineManager::Get(const std::string& name) const { +paddle::lite_api::PaddlePredictor* EngineManager::Get( + const std::string& name) const { return engines_.at(name).get(); } -paddle::lite::Predictor* EngineManager::Create(const std::string& name, - const EngineConfig& cfg) { - if (cfg.valid_places.front().target == TARGET(kCUDA)) { -#ifdef PADDLE_WITH_CUDA - paddle::lite::Env::Init(); +paddle::lite_api::PaddlePredictor* EngineManager::Create( + const std::string& name, const EngineConfig& cfg) { + // config info for predictor. 
+  paddle::lite_api::CxxConfig lite_cxx_config;
+  lite_cxx_config.set_model_buffer(cfg.model.c_str(), cfg.model.size(),
+                                   cfg.param.c_str(), cfg.param.size());
+  lite_cxx_config.set_valid_places(cfg.valid_places);
+#ifdef PADDLE_WITH_ARM
+  lite_cxx_config.set_threads(cfg.cpu_math_library_num_threads);
+#else
+  lite_cxx_config.set_x86_math_library_num_threads(
+      cfg.cpu_math_library_num_threads);
 #endif
-  } else if (cfg.valid_places.front().target == TARGET(kXPU)) {
+
 #ifdef PADDLE_WITH_XPU
-    paddle::lite::TargetWrapper::workspace_l3_size_per_thread =
-        cfg.xpu_l3_workspace_size;
+  lite_cxx_config.set_xpu_workspace_l3_size_per_thread(
+      cfg.xpu_l3_workspace_size);
 #endif
-  }
-  auto* p = new paddle::lite::Predictor();
-  p->Build("", cfg.model, cfg.param, cfg.valid_places, cfg.neglected_passes,
-           cfg.model_type, cfg.model_from_memory);
-  engines_[name].reset(p);
-  return p;
+
+  // create predictor
+  std::shared_ptr p =
+      paddle::lite_api::CreatePaddlePredictor(lite_cxx_config);
+  engines_[name] = std::move(p);
+  return engines_[name].get();
 }
 
 void EngineManager::DeleteAll() {
   for (auto& item : engines_) {
-    item.second.reset(nullptr);
+    item.second.reset();
   }
 }
diff --git a/paddle/fluid/inference/lite/engine.h b/paddle/fluid/inference/lite/engine.h
index 345eb682e9fe81d4ec67a31082c1d347a694fd96..5ba487cc24d7d58cd87853a58fc12f1a82c3610d 100644
--- a/paddle/fluid/inference/lite/engine.h
+++ b/paddle/fluid/inference/lite/engine.h
@@ -23,12 +23,9 @@
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wall"
 #include "lite/api/cxx_api.h"
+#include "lite/api/paddle_api.h"
 #include "lite/api/paddle_place.h"
-#include "lite/core/context.h"
-#include "lite/core/device_info.h"
-#include "lite/core/memory.h"
-#include "lite/core/op_registry.h"
-#include "lite/core/tensor.h"
+#include "lite/api/paddle_use_passes.h"
 #pragma GCC diagnostic pop
 
 namespace paddle {
@@ -38,25 +35,33 @@ namespace lite {
 struct EngineConfig {
   std::string model;
   std::string param;
-  paddle::lite::Place prefer_place;
-  std::vector valid_places;
+  std::vector valid_places;
   std::vector neglected_passes;
   lite_api::LiteModelType model_type{lite_api::LiteModelType::kProtobuf};
   bool model_from_memory{true};
+
+  // for xpu
   size_t xpu_l3_workspace_size;
+
+  // for x86 or arm
+  int cpu_math_library_num_threads{1};
+
+  // for cuda
+  bool use_multi_stream{false};
 };
 
 class EngineManager {
  public:
   bool Empty() const;
   bool Has(const std::string& name) const;
-  paddle::lite::Predictor* Get(const std::string& name) const;
-  paddle::lite::Predictor* Create(const std::string& name,
-                                  const EngineConfig& cfg);
+  paddle::lite_api::PaddlePredictor* Get(const std::string& name) const;
+  paddle::lite_api::PaddlePredictor* Create(const std::string& name,
+                                            const EngineConfig& cfg);
   void DeleteAll();
 
  private:
-  std::unordered_map>
+  std::unordered_map>
       engines_;
 };
diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc
index d79a041ccf8a1611247b65b034c03940eabfcccd..33661594b926f284052c85c6a816a17dfff1ce20 100644
--- a/paddle/fluid/inference/lite/tensor_utils.cc
+++ b/paddle/fluid/inference/lite/tensor_utils.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
#include "paddle/fluid/inference/lite/tensor_utils.h" +#include #include #include #include "paddle/fluid/framework/data_type.h" @@ -144,16 +145,55 @@ void MemoryCopyAsync(const platform::Place& dst_place, void* dst_data, } } -void InitDstTensor(paddle::lite::Tensor* dst, const framework::LoDTensor& src) { +void* GetLiteTensorDataPtr(paddle::lite_api::Tensor* src, + PrecisionType precision_type, + TargetType target_type) { + void* res{nullptr}; + switch (precision_type) { + case PrecisionType::kFloat: + res = static_cast(src->mutable_data(target_type)); + break; + case PrecisionType::kInt8: + res = static_cast(src->mutable_data(target_type)); + break; + case PrecisionType::kInt32: + res = static_cast(src->mutable_data(target_type)); + break; + case PrecisionType::kInt64: + res = static_cast(src->mutable_data(target_type)); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported precision type. Now only supports FP32, INT8, INT32 and " + "INT64.")); + break; + } + return res; +} + +int64_t GetLiteTensorNumel(const paddle::lite_api::Tensor& tensor) { + auto shape = tensor.shape(); + int64_t numel = std::accumulate(shape.begin(), shape.end(), 1, + std::multiplies()); + return numel; +} + +void InitDstTensor(paddle::lite_api::Tensor* dst, + const framework::LoDTensor& src) { // Currently, Lite needs to explicitly specify the target type of // the input tensor. constexpr int empty_size = 0; - dst->mutable_data(GetLiteTargetType(src.place()), empty_size); - dst->set_precision(GetLitePrecisionType(src.type())); - SetLoD(dst->mutable_lod(), src.lod()); + dst->Resize({empty_size}); + GetLiteTensorDataPtr(dst, GetLitePrecisionType(src.type()), + GetLiteTargetType(src.place())); + dst->SetPrecision(GetLitePrecisionType(src.type())); + paddle::lite::LoD lite_lod; + SetLoD(&lite_lod, src.lod()); + dst->SetLoD(lite_lod); } -void InitDstTensor(framework::LoDTensor* dst, const paddle::lite::Tensor& src) { +void InitDstTensor(framework::LoDTensor* dst, + const paddle::lite_api::Tensor& src) { constexpr framework::proto::VarType::Type dtype = framework::proto::VarType_Type_FP32; dst->mutable_data(inference::lite::utils::GetNativePlace(src.target()), @@ -162,7 +202,8 @@ void InitDstTensor(framework::LoDTensor* dst, const paddle::lite::Tensor& src) { } template <> -void TensorCopyAsync(paddle::lite::Tensor* dst, const framework::LoDTensor& src, +void TensorCopyAsync(paddle::lite_api::Tensor* dst, + const framework::LoDTensor& src, const platform::DeviceContext& ctx) { InitDstTensor(dst, src); const platform::Place& src_place = src.place(); @@ -171,52 +212,56 @@ void TensorCopyAsync(paddle::lite::Tensor* dst, const framework::LoDTensor& src, static_cast(src.numel()) * framework::SizeOfType(src.type()); dst->Resize(framework::vectorize(src.dims())); const void* src_data = src.data(); - void* dst_data = dst->mutable_data(bytes); + void* dst_data{nullptr}; + dst_data = GetLiteTensorDataPtr(dst, GetLitePrecisionType(src.type()), + GetLiteTargetType(src.place())); VLOG(3) << "[CopyAsync fluid -> lite] Bytes = " << bytes << ", src = " << &src << ", dst = " << dst << ", src_type = " << src.type(); MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx); - VLOG(3) << "[Lite memory size] Bytes = " << dst->memory_size(); + VLOG(3) << "[Lite memory size] Bytes = " << bytes; } template <> -void TensorCopyAsync(framework::LoDTensor* dst, const paddle::lite::Tensor& src, +void TensorCopyAsync(framework::LoDTensor* dst, + const paddle::lite_api::Tensor& src, const 
platform::DeviceContext& ctx) { - dst->Resize(paddle::framework::make_ddim(src.dims().Vectorize())); + dst->Resize(paddle::framework::make_ddim(src.shape())); InitDstTensor(dst, src); const platform::Place& src_place = GetNativePlace(src.target()); const platform::Place& dst_place = dst->place(); - const size_t bytes = - static_cast(src.numel()) * framework::SizeOfType(dst->type()); - const void* src_data = src.raw_data(); + int64_t src_numel = GetLiteTensorNumel(src); + const size_t bytes = src_numel * framework::SizeOfType(dst->type()); + const void* src_data = src.data(); // When Lite is ready, the source type needs to be modified here. void* dst_data = dst->mutable_data(dst_place, dst->type()); VLOG(3) << "[CopyAsync lite -> fluid] Bytes = " << bytes << ", src = " << &src << ", dst = " << dst << ", src_type = " << dst->type(); MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx); - VLOG(3) << "[Lite memory size] Bytes = " << src.memory_size(); + VLOG(3) << "[Lite memory size] Bytes = " << bytes; } template <> -void TensorDataShare(paddle::lite::Tensor* dst, framework::LoDTensor* src) { - const size_t bytes = - static_cast(src->numel()) * framework::SizeOfType(src->type()); - auto buf = std::make_shared(paddle::lite::Buffer( - src->data(), GetLiteTargetType(src->place()), src->memory_size())); +void TensorDataShare(paddle::lite_api::Tensor* dst, framework::LoDTensor* src) { dst->Resize(framework::vectorize(src->dims())); - dst->set_precision(GetLitePrecisionType(src->type())); - SetLoD(dst->mutable_lod(), src->lod()); - dst->ResetBuffer(buf, bytes); + dst->ShareExternalMemory(src->data(), src->memory_size(), + GetLiteTargetType(src->place())); + dst->SetPrecision(GetLitePrecisionType(src->type())); + paddle::lite::LoD lite_lod; + SetLoD(&lite_lod, src->lod()); + dst->SetLoD(lite_lod); } template <> -void TensorDataShare(framework::LoDTensor* dst, paddle::lite::Tensor* src) { +void TensorDataShare(framework::LoDTensor* dst, paddle::lite_api::Tensor* src) { constexpr framework::proto::VarType::Type dtype = framework::proto::VarType_Type_FP32; - void* src_raw_data = src->raw_data(); + void* src_raw_data = + GetLiteTensorDataPtr(src, GetLitePrecisionType(dtype), src->target()); + size_t memory_size = GetLiteTensorNumel(*src) * sizeof(float); std::shared_ptr holder( - new memory::allocation::Allocation(src_raw_data, src->memory_size(), + new memory::allocation::Allocation(src_raw_data, memory_size, GetNativePlace(src->target()))); - dst->Resize(paddle::framework::make_ddim(src->dims().Vectorize())); + dst->Resize(paddle::framework::make_ddim(src->shape())); SetLoD(dst->mutable_lod(), src->lod()); dst->ResetHolderWithType(holder, dtype); } diff --git a/paddle/fluid/inference/lite/test_engine.cc b/paddle/fluid/inference/lite/test_engine.cc index d29bcb76be78f151dc606d9f335e9df9ed19b16b..e505af19d5389c074c5777d0235dfa055d1395a7 100644 --- a/paddle/fluid/inference/lite/test_engine.cc +++ b/paddle/fluid/inference/lite/test_engine.cc @@ -102,10 +102,10 @@ TEST(EngineManager, engine) { config.model_from_memory = true; config.valid_places = { #ifdef PADDLE_WITH_CUDA - paddle::lite::Place({TARGET(kCUDA), PRECISION(kFloat)}), + paddle::lite_api::Place({TARGET(kCUDA), PRECISION(kFloat)}), #endif - paddle::lite::Place({TARGET(kX86), PRECISION(kFloat)}), - paddle::lite::Place({TARGET(kHost), PRECISION(kAny)}), + paddle::lite_api::Place({TARGET(kX86), PRECISION(kFloat)}), + paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}), }; LOG(INFO) << "Create EngineManager"; @@ -118,7 +118,7 
@@ TEST(EngineManager, engine) { ASSERT_EQ(inference::Singleton::Global().Has( unique_key), true); - paddle::lite::Predictor* engine_0 = + paddle::lite_api::PaddlePredictor* engine_0 = inference::Singleton::Global().Get( unique_key); CHECK_NOTNULL(engine_0); diff --git a/paddle/fluid/inference/lite/test_tensor_utils.cc b/paddle/fluid/inference/lite/test_tensor_utils.cc index eef7bfb68fe06537d09f3f3e7e5c35283d4739ef..a792fb77d6ad483601402506685e2f91066571da 100644 --- a/paddle/fluid/inference/lite/test_tensor_utils.cc +++ b/paddle/fluid/inference/lite/test_tensor_utils.cc @@ -73,6 +73,33 @@ TEST(LiteEngineOp, GetNativeLayoutType) { EXPECT_ANY_THROW(GetNativeLayoutType(DataLayoutType::kNHWC)); } +template +void test_lite_tensor_data_ptr(PrecisionType precision_type) { + void* GetLiteTensorDataPtr(paddle::lite_api::Tensor * src, + PrecisionType precision_type, + TargetType target_type); + const int count = 4; + paddle::lite::Tensor lite_tensor; + lite_tensor.Resize({count}); + auto* lite_tensor_data = lite_tensor.mutable_data(); + for (size_t i = 0; i < count; ++i) { + lite_tensor_data[i] = i; + } + paddle::lite_api::Tensor lite_api_tensor(&lite_tensor); + T* data = static_cast(GetLiteTensorDataPtr( + &lite_api_tensor, precision_type, TargetType::kHost)); + for (size_t i = 0; i < count; ++i) { + CHECK_EQ(data[i], static_cast(i)) << "the i-th num is not correct."; + } +} + +TEST(LiteEngineOp, GetLiteTensorDataPtr) { + test_lite_tensor_data_ptr(PrecisionType::kInt64); + test_lite_tensor_data_ptr(PrecisionType::kInt32); + test_lite_tensor_data_ptr(PrecisionType::kInt8); + EXPECT_ANY_THROW(test_lite_tensor_data_ptr(PrecisionType::kUnk)); +} + void test_tensor_copy(const platform::DeviceContext& ctx) { // Create LoDTensor. std::vector vector({1, 2, 3, 4}); @@ -83,10 +110,11 @@ void test_tensor_copy(const platform::DeviceContext& ctx) { lod_tensor.set_lod(lod); // Create lite::Tensor and copy. paddle::lite::Tensor lite_tensor; - TensorCopyAsync(&lite_tensor, lod_tensor, ctx); + paddle::lite_api::Tensor lite_api_tensor(&lite_tensor); + TensorCopyAsync(&lite_api_tensor, lod_tensor, ctx); // Copy to LoDTensor. framework::LoDTensor lod_tensor_n; - TensorCopyAsync(&lod_tensor_n, lite_tensor, ctx); + TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(ctx.GetPlace())) { platform::GpuStreamSync( @@ -108,10 +136,11 @@ void test_tensor_share(const platform::DeviceContext& ctx) { lod_tensor.set_lod(lod); // Create lite::Tensor and share. paddle::lite::Tensor lite_tensor; - TensorDataShare(&lite_tensor, &lod_tensor); + paddle::lite_api::Tensor lite_api_tensor(&lite_tensor); + TensorDataShare(&lite_api_tensor, &lod_tensor); // Copy to LoDTensor. 
   framework::LoDTensor lod_tensor_n;
-  TensorCopyAsync(&lod_tensor_n, lite_tensor, ctx);
+  TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx);
   std::vector result;
   TensorToVector(lod_tensor_n, ctx, &result);
   ASSERT_EQ(result, vector);
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 22be877493272cd393538fd4f04184e77d38e2db..754979f77acd7a3b4818cdf16ef9c525bf1d82ea 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -63,11 +63,13 @@ void TensorRTEngine::Execute(int batch_size, std::vector *buffers,
 
 void TensorRTEngine::FreezeNetwork() {
   freshDeviceId();
   VLOG(3) << "TRT to freeze network";
-  PADDLE_ENFORCE(infer_builder_ != nullptr,
-                 "Call InitNetwork first to initialize network.");
-  PADDLE_ENFORCE_EQ(network() != nullptr, true,
-                    platform::errors::InvalidArgument(
-                        "Call InitNetwork first to initialize network."));
+  PADDLE_ENFORCE_NOT_NULL(infer_builder_,
+                          platform::errors::InvalidArgument(
+                              "Inference builder of TRT is null. Please make "
+                              "sure you call InitNetwork first."));
+  PADDLE_ENFORCE_NOT_NULL(network(),
+                          platform::errors::InvalidArgument(
+                              "Call InitNetwork first to initialize network."));
   // build engine.
   infer_builder_->setMaxBatchSize(max_batch_);
   infer_builder_->setMaxWorkspaceSize(max_workspace_);
@@ -210,7 +212,10 @@ void TensorRTEngine::FreezeNetwork() {
   } else {
     infer_engine_.reset(infer_builder_->buildCudaEngine(*network()));
   }
-  PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!");
+  PADDLE_ENFORCE_NOT_NULL(
+      infer_engine_, platform::errors::Fatal(
+                         "Build TensorRT cuda engine failed! Please recheck "
+                         "your configurations related to paddle-TensorRT."));
 }
 
 nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
@@ -220,8 +225,16 @@ nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
                     platform::errors::InvalidArgument(
                         "The TRT network should be initialized first."));
   auto *input = network()->addInput(name.c_str(), dtype, dims);
-  PADDLE_ENFORCE(input, "infer network add input %s failed", name);
-  PADDLE_ENFORCE(input->isNetworkInput());
+  PADDLE_ENFORCE_NOT_NULL(
+      input, platform::errors::InvalidArgument("Adding input %s failed in "
+                                               "TensorRT inference network. "
+                                               "Please recheck your input.",
+                                               name));
+  PADDLE_ENFORCE_EQ(input->isNetworkInput(), true,
+                    platform::errors::InvalidArgument(
+                        "Input %s is not the input of TRT inference network. "
+                        "Please recheck your input.",
+                        name));
   TensorRTEngine::SetITensor(name, input);
   return input;
 }
@@ -230,31 +243,53 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
                                    const std::string &name) {
   auto *output = layer->getOutput(offset);
   SetITensor(name, output);
-  PADDLE_ENFORCE(output != nullptr);
+  PADDLE_ENFORCE_NOT_NULL(
+      output, platform::errors::InvalidArgument(
+                  "The output %s of TRT engine should not be null.", name));
   output->setName(name.c_str());
-  PADDLE_ENFORCE(!output->isNetworkInput());
+  PADDLE_ENFORCE_EQ(output->isNetworkInput(), false,
+                    platform::errors::InvalidArgument(
+                        "The output %s of TRT engine should not be the input "
+                        "of the network at the same time.",
+                        name));
   network()->markOutput(*output);
-  PADDLE_ENFORCE(output->isNetworkOutput());
+  PADDLE_ENFORCE_EQ(
+      output->isNetworkOutput(), true,
+      platform::errors::InvalidArgument(
+          "The output %s of TRT engine should be the output of the network.",
+          name));
 }
 
 void TensorRTEngine::DeclareOutput(const std::string &name) {
   auto *output = TensorRTEngine::GetITensor(name);
-  PADDLE_ENFORCE(output != nullptr);
+  PADDLE_ENFORCE_NOT_NULL(
+      output, platform::errors::InvalidArgument(
+                  "The output %s of TRT engine should not be null.", name));
   output->setName(name.c_str());
-  PADDLE_ENFORCE(!output->isNetworkInput());
+  PADDLE_ENFORCE_EQ(output->isNetworkInput(), false,
+                    platform::errors::InvalidArgument(
+                        "The output %s of TRT engine should not be the input "
+                        "of the network at the same time.",
+                        name));
   network()->markOutput(*output);
 }
 
 void TensorRTEngine::SetITensor(const std::string &name,
                                 nvinfer1::ITensor *tensor) {
-  PADDLE_ENFORCE(tensor != nullptr);
-  PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s",
-                    name);
+  PADDLE_ENFORCE_NOT_NULL(
+      tensor, platform::errors::InvalidArgument(
+                  "Tensor named %s of TRT engine should not be null.", name));
+  PADDLE_ENFORCE_EQ(
+      0, itensor_map_.count(name),
+      platform::errors::InvalidArgument(
+          "Tensor named %s of TRT engine should not be duplicated.", name));
   itensor_map_[name] = tensor;
 }
 
 nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) {
-  PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name);
+  PADDLE_ENFORCE_EQ(itensor_map_.count(name), true,
+                    platform::errors::NotFound(
+                        "Tensor named %s is not found in TRT engine.", name));
   return itensor_map_[name];
 }
 
@@ -271,11 +306,11 @@ float *TensorRTEngine::GetWeightCPUData(const std::string &name,
   std::string splitter = "__";
   std::string name_with_suffix = name + splitter + name_suffix;
   platform::CPUPlace cpu_place;
-  PADDLE_ENFORCE_EQ(
-      weight_map.count(name_with_suffix), 0,
-      "During TRT Op converter: We set weight %s with the same name "
-      "twice into the weight_map",
-      name_with_suffix);
+  PADDLE_ENFORCE_EQ(weight_map.count(name_with_suffix), 0,
+                    platform::errors::AlreadyExists(
+                        "The weight named %s is set into the weight map "
+                        "twice in TRT OP converter.",
+                        name_with_suffix));
   weight_map[name_with_suffix].reset(new framework::Tensor());
   weight_map[name_with_suffix]->Resize(weight_tensor->dims());
   TensorCopySync(*weight_tensor, cpu_place, weight_map[name_with_suffix].get());
@@ -297,7 +332,10 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
 void TensorRTEngine::freshDeviceId() {
   int count;
   cudaGetDeviceCount(&count);
-  PADDLE_ENFORCE_LT(device_id_, count);
+  PADDLE_ENFORCE_LT(device_id_, count,
+                    platform::errors::OutOfRange(
+                        "Device id %d exceeds the current device count: %d.",
+                        device_id_, count));
   cudaSetDevice(device_id_);
 }
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 1a3413657ce6fac41603d691dcdb61ddb1d6320a..a85ed483c1d12c3f2eecc5ed4bcb99937397a765 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -196,8 +196,10 @@ class TensorRTEngine {
   }
 
   nvinfer1::IHostMemory* Serialize() {
-    PADDLE_ENFORCE(infer_engine_ != nullptr,
-                   "You should build engine first and then serialize");
+    PADDLE_ENFORCE_NOT_NULL(
+        infer_engine_,
+        platform::errors::InvalidArgument(
+            "The TensorRT engine must be built before it can be serialized."));
     ihost_memory_.reset(infer_engine_->serialize());
     return ihost_memory_.get();
   }
@@ -222,8 +224,14 @@ class TensorRTEngine {
           engine_serialized_data.c_str(), engine_serialized_data.size(),
           &inference::Singleton::Global()));
     }
-    PADDLE_ENFORCE(infer_engine_ != nullptr,
-                   "build cuda engine failed when deserialize engine info.!");
+    PADDLE_ENFORCE_NOT_NULL(
+        infer_engine_,
+        platform::errors::Fatal(
+            "Building TRT cuda engine failed when deserializing engine info. "
+            "Please check:\n1. Your TRT serialization is generated and loaded "
+            "on the same GPU architecture;\n2. The Paddle Inference version "
+            "used to generate the serialization file is consistent with the "
+            "version used for inference."));
   }
 
   void SetRuntimeBatch(size_t batch_size);
diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
index 0ec803fe64afadd970777e3b0d0ab5d37fcc4d22..457d9dd87375477926480bce0a84e8f89c409698 100644
--- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
@@ -56,14 +56,27 @@ __global__ void elementwise_kernel(const size_t total, const T *x_data,
 
 nvinfer1::Dims ElementWisePlugin::getOutputDimensions(
     int index, const nvinfer1::Dims *input_dims, int num_inputs) {
-  PADDLE_ENFORCE_EQ(index, 0);
-  PADDLE_ENFORCE_EQ(num_inputs, 2);
-  PADDLE_ENFORCE_NOT_NULL(input_dims);
+  PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
+                                  "There is only one output in TRT elementwise "
+                                  "op plugin, but got output index: %d.",
+                                  index));
+  PADDLE_ENFORCE_EQ(num_inputs, 2, platform::errors::InvalidArgument(
+                                       "There are 2 inputs in TRT elementwise "
+                                       "op plugin, but got input number: %d.",
+                                       num_inputs));
+  PADDLE_ENFORCE_NOT_NULL(
+      input_dims,
+      platform::errors::InvalidArgument(
+          "The input dims of TRT elementwise op plugin should not be null."));
   return input_dims[0];
 }
 
 int ElementWisePlugin::initialize() {
-  PADDLE_ENFORCE_GT(dims_y_.nbDims, 0);
+  PADDLE_ENFORCE_GT(dims_y_.nbDims, 0,
+                    platform::errors::InvalidArgument(
+                        "The dimension of input Y of TRT elementwise op plugin "
+                        "should be greater than 0, but got %d.",
+                        dims_y_.nbDims));
   axis_ = (axis_ == -1) ? dims_x_.nbDims - dims_y_.nbDims : axis_;
   int trimed_nb_dims = dims_y_.nbDims;
@@ -74,8 +87,18 @@ int ElementWisePlugin::initialize() {
   }
   dims_y_.nbDims = trimed_nb_dims;
 
-  PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_);
-  PADDLE_ENFORCE_LT(axis_, dims_x_.nbDims);
+  PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_,
+                    platform::errors::InvalidArgument(
+                        "We expect [number of x dims] >= [number of y dims + "
+                        "axis] in TRT elementwise op plugin, but got [number "
+                        "of x dims] = %d, [number of y dims + axis] = %d.",
+                        dims_x_.nbDims, dims_y_.nbDims + axis_));
+  PADDLE_ENFORCE_LT(
+      axis_, dims_x_.nbDims,
+      platform::errors::InvalidArgument("We expect [axis] < [number of x dims] "
+                                        "in TRT elementwise op plugin, but got "
+                                        "[axis] = %d, [number of x dims] = %d.",
+                                        axis_, dims_x_.nbDims));
 
   prev_size_ = 1;
   midd_size_ = 1;
@@ -86,7 +109,9 @@ int ElementWisePlugin::initialize() {
 
   for (int i = 0; i < dims_y_.nbDims; ++i) {
     PADDLE_ENFORCE_EQ(dims_x_.d[i + axis_], dims_y_.d[i],
-                      "Broadcast dimension mismatch.");
+                      platform::errors::InvalidArgument(
+                          "Broadcast dimension mismatch. The dims of input Y "
+                          "should be a subsequence of X."));
     midd_size_ *= dims_y_.d[i];
   }
 
@@ -221,7 +246,10 @@ int ElementwisePluginDynamic::enqueue(
     elementwise_kernel<<>>(
         num, x, y, out, prev_size, midd_size, post_size, details::Mul());
   } else {
-    PADDLE_THROW("Not implemented.");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Paddle-TRT only supports elementwise operations: {add, mul} "
+        "currently, but got %s.",
+        type_));
   }
 
   return cudaGetLastError() != cudaSuccess;
diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc
index a03dd45db0f80487cb4c2e6b68f94944e8558ae4..72962c733ecf6a7bc6871fd3a5c65d6156b084d4 100644
--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
@@ -74,7 +74,9 @@ TEST_F(TensorRTEngineTest, add_layer) {
                                 nvinfer1::DimsCHW{1, 1, 1});
   auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size,
                                         weight.get(), bias.get());
-  PADDLE_ENFORCE(fc_layer != nullptr);
+  PADDLE_ENFORCE_NOT_NULL(fc_layer,
+                          platform::errors::InvalidArgument(
+                              "TRT fully connected layer building failed."));
   engine_->DeclareOutput(fc_layer, 0, "y");
 
   LOG(INFO) << "freeze network";
@@ -116,7 +118,9 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
                                 nvinfer1::DimsCHW{1, 2, 1});
   auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2,
                                         weight.get(), bias.get());
-  PADDLE_ENFORCE(fc_layer != nullptr);
+  PADDLE_ENFORCE_NOT_NULL(fc_layer,
+                          platform::errors::InvalidArgument(
+                              "TRT fully connected layer building failed."));
   engine_->DeclareOutput(fc_layer, 0, "y");
 
   engine_->FreezeNetwork();
@@ -160,7 +164,9 @@ TEST_F(TensorRTEngineTest, test_conv2d) {
   auto *conv_layer =
       TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3},
                            weight.get(), bias.get());
-  PADDLE_ENFORCE(conv_layer != nullptr);
+  PADDLE_ENFORCE_NOT_NULL(conv_layer,
+                          platform::errors::InvalidArgument(
+                              "TRT convolution layer building failed."));
   conv_layer->setStride(nvinfer1::DimsHW{1, 1});
   conv_layer->setPadding(nvinfer1::DimsHW{1, 1});
 
@@ -199,7 +205,9 @@ TEST_F(TensorRTEngineTest, test_pool2d) {
   auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *x, pool_t,
                                           nvinfer1::DimsHW{2, 2});
-  PADDLE_ENFORCE(pool_layer != nullptr);
+  PADDLE_ENFORCE_NOT_NULL(
+      pool_layer,
+      platform::errors::InvalidArgument("TRT pooling layer building failed."));
   pool_layer->setStride(nvinfer1::DimsHW{1, 1});
pool_layer->setPadding(nvinfer1::DimsHW{0, 0}); diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc index 34b7072b2ee688c2ac01229ff5d3a234af3680b5..743f7740e5faaa1991172ef2a8d1cd38ad47fab5 100644 --- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc @@ -83,9 +83,8 @@ bool TRTInt8Calibrator::setBatch( engine_name_, it.first)); } const auto& d = dataptr->second; - PADDLE_ENFORCE( - cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice), - "Fail to cudaMemcpy %s for %s", engine_name_, it.first); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice)); } data_is_set_ = true; diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index fd4b1a54d2b44c5bb99d3aa2e0c776c806fb5c8d..b3ec4b5714eb17032039eb234e148cdbd38c7877 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -342,9 +342,9 @@ if(WITH_MKLDNN) ### Lexcial analysis GRU model set(GRU_PATH "${INFERENCE_DEMO_INSTALL_DIR}/gru") download_GRU_data("${GRU_PATH}" "GRU_eval_data.tar.gz") - download_GRU_data("${GRU_PATH}" "GRU_eval_model.tar.gz") + download_GRU_data("${GRU_PATH}" "GRU_eval_model_v2.tar.gz") set(GRU_DATA_PATH "${GRU_PATH}/GRU_eval_data.bin") - set(GRU_MODEL_PATH "${GRU_PATH}/GRU_eval_model") + set(GRU_MODEL_PATH "${GRU_PATH}/GRU_eval_model_v2") set(LEXICAL_TEST_APP "test_analyzer_lexical_analysis") set(LEXICAL_TEST_APP_SRC "analyzer_lexical_analysis_gru_tester.cc") diff --git a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc index 31701c59ec33dfced5745f7f16d8f00ffce462ef..9ae073e9e5b142254b32396e0355f59ae1826909 100644 --- a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc +++ b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc @@ -27,7 +27,7 @@ TEST(AnalysisPredictor, use_gpu) { AnalysisConfig config; config.EnableUseGpu(100, 0); config.SetModel(model_dir + "/model", model_dir + "/params"); - config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32); + config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32, true); std::vector inputs; auto predictor = CreatePaddlePredictor(config); diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 6e8ff52ed4a8846f5f6060e10cfd9bec22308e9e..f0a04d850dff01e0776e96bbe518cde2ce8bb88b 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -45,7 +45,9 @@ endif() SET(OP_HEADER_DEPS xxhash executor) if (WITH_GPU) - SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub) + if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub) + endif() endif() SET(OP_PREFETCH_DEPS "") diff --git a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cc b/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cc deleted file mode 100644 index 7f0ca1493f712f7f4809a56bf6a23f8757f94c2d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cc +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h" - -#include -#include - -namespace paddle { -namespace operators { - -class AmpCheckFiniteAndScaleOp : public framework::OperatorWithKernel { - public: - AmpCheckFiniteAndScaleOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorWithKernel(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", - "amp_check_finite_and_unscale"); - OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out", - "amp_check_finite_and_unscale"); - PADDLE_ENFORCE_EQ( - ctx->Inputs("X").size(), ctx->Outputs("Out").size(), - platform::errors::InvalidArgument( - "The input(X) and output(Out) should have same size in " - "Operator(amp_check_finite_and_unscale), size of input(X) is %d " - "and size of output(Out) is %d.", - ctx->Inputs("X").size(), ctx->Outputs("Out").size())); - auto x_dims = ctx->GetInputsDim("X"); - ctx->SetOutputsDim("Out", x_dims); - ctx->SetOutputDim("FoundInfinite", {1}); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); - } -}; - -class AmpCheckFiniteAndScaleOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput( - "X", - "(Tensors) The input tensors of amp_check_finite_and_scale operator.") - .AsDuplicable(); - AddInput("Scale", - "(Tensor) 1-dim tensor, the scale of amp_check_finite_and_scale " - "operator."); - AddOutput("Out", - "(Tensors) The scaled output tensor of " - "amp_check_finite_and_unscale operator.") - .AsDuplicable(); - AddOutput("FoundInfinite", - "(Tensor) 1-dim tensor, contains a bool scalar, which indicates " - "if there there is infinite or nan item in input X."); - AddComment(R"DOC( -amp_check_finite_and_scale operator. -Check if input X contains all finite data, if yes, scale it by input Scale. - -$$Out = X * scale$$ - -If any tensor in X contains Inf or Nan, the Out will generate a indicator. -FoundInfinite will be 1 (True), and Out will not be scaled. In this case, the data of -Out should not be used, and its data may not be deterministic. -Otherwise, FoundInfinite will be 0 (False). 
- -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR( - amp_check_finite_and_scale, ops::AmpCheckFiniteAndScaleOp, - ops::AmpCheckFiniteAndScaleOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL( - amp_check_finite_and_scale, - ops::AmpCheckFiniteAndScaleKernel, - ops::AmpCheckFiniteAndScaleKernel); diff --git a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h b/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h deleted file mode 100644 index 6c2c4eb8a615c4c04a98601c25b5de43b4262e6b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/operators/isfinite_op.h" - -namespace paddle { -namespace operators { - -template -class AmpCheckFiniteAndScaleKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto& dev_ctx = ctx.template device_context(); - const auto xs = ctx.MultiInput("X"); - const auto* scale = ctx.Input("Scale"); - auto outs = ctx.MultiOutput("Out"); - auto* found_inf = ctx.Output("FoundInfinite"); - - const T* scale_data = scale->data(); - bool* found_inf_data = found_inf->mutable_data(dev_ctx.GetPlace()); - - *found_inf_data = false; - framework::Tensor is_finite = - ctx.AllocateTmpTensor({1}, dev_ctx); - bool* is_finite_data = is_finite.template data(); - - auto& dev = *ctx.template device_context().eigen_device(); - for (size_t i = 0; i < xs.size(); ++i) { - const auto* x = xs[i]; - auto* out = outs[i]; - out->mutable_data(dev_ctx.GetPlace()); - if (!(*found_inf_data)) { - framework::TensorIsfinite(*x, &is_finite); - if (*is_finite_data) { - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*x); - eigen_out.device(dev) = (*scale_data) * eigen_in; - } else { - *found_inf_data = true; - break; - } - } - } - return; - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..51c659d5db1c33d5e2db261b998a0673f5e766cb --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc @@ -0,0 +1,141 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h"
+#include "paddle/fluid/framework/tensor_util.h"
+
+namespace paddle {
+namespace operators {
+
+class CheckFiniteAndUnscaleOp : public framework::OperatorWithKernel {
+ public:
+  CheckFiniteAndUnscaleOp(const std::string& type,
+                          const framework::VariableNameMap& inputs,
+                          const framework::VariableNameMap& outputs,
+                          const framework::AttributeMap& attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X",
+                   "check_finite_and_unscale");
+    OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out",
+                   "check_finite_and_unscale");
+    PADDLE_ENFORCE_EQ(
+        ctx->Inputs("X").size(), ctx->Outputs("Out").size(),
+        platform::errors::InvalidArgument(
+            "The input(X) and output(Out) should have same size in "
+            "Operator(check_finite_and_unscale), size of input(X) is %d "
+            "and size of output(Out) is %d.",
+            ctx->Inputs("X").size(), ctx->Outputs("Out").size()));
+    auto x_dims = ctx->GetInputsDim("X");
+    ctx->SetOutputsDim("Out", x_dims);
+    ctx->SetOutputDim("FoundInfinite", {1});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
+  }
+};
+
+class CheckFiniteAndUnscaleOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput(
+        "X",
+        "(Tensors) The input tensors of check_finite_and_unscale operator.")
+        .AsDuplicable();
+    AddInput("Scale",
+             "(Tensor) 1-dim tensor, the scale of check_finite_and_unscale "
+             "operator.");
+    AddOutput("Out",
+              "(Tensors) The scaled output tensors of "
+              "check_finite_and_unscale operator.")
+        .AsDuplicable();
+    AddOutput("FoundInfinite",
+              "(Tensor) 1-dim tensor, contains a bool scalar, which indicates "
+              "whether there is any infinite or nan item in input X.");
+    AddComment(R"DOC(
+check_finite_and_unscale operator.
+Check if input X contains all finite data; if yes, divide it by input Scale.
+
+$$Out = X / scale$$
+
+If any tensor in X contains Inf or Nan, Out will generate an indicator.
+FoundInfinite will be 1 (True), and Out will not be scaled. In this case, the data of
+Out should not be used, and its data may not be deterministic.
+Otherwise, FoundInfinite will be 0 (False).
+ +)DOC"); + } +}; + +template +class CheckFiniteAndUnscaleCpuKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto& dev_ctx = ctx.template device_context(); + const auto xs = ctx.MultiInput("X"); + const auto* scale = ctx.Input("Scale"); + auto outs = ctx.MultiOutput("Out"); + auto* found_inf = ctx.Output("FoundInfinite"); + + const T* scale_data = scale->data(); + bool* found_inf_data = found_inf->mutable_data(dev_ctx.GetPlace()); + + *found_inf_data = false; + framework::Tensor is_finite = + ctx.AllocateTmpTensor({1}, dev_ctx); + bool* is_finite_data = is_finite.template data(); + + auto& dev = *ctx.template device_context() + .eigen_device(); + + T inverse_scale = Inverse(*scale_data); + for (size_t i = 0; i < xs.size(); ++i) { + const auto* x = xs[i]; + auto* out = outs[i]; + out->mutable_data(dev_ctx.GetPlace()); + if (!(*found_inf_data)) { + framework::TensorIsfinite(*x, &is_finite); + *found_inf_data = !(*is_finite_data); + } + auto eigen_out = framework::EigenVector::Flatten(*out); + auto eigen_in = framework::EigenVector::Flatten(*x); + if (!(*found_inf_data)) { + eigen_out.device(dev) = eigen_in * inverse_scale; + } else { + eigen_out.device(dev) = eigen_in * static_cast(0); + } + } + return; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR( + check_finite_and_unscale, ops::CheckFiniteAndUnscaleOp, + ops::CheckFiniteAndUnscaleOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(check_finite_and_unscale, + ops::CheckFiniteAndUnscaleCpuKernel, + ops::CheckFiniteAndUnscaleCpuKernel); diff --git a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cu b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu similarity index 63% rename from paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cu rename to paddle/fluid/operators/amp/check_finite_and_unscale_op.cu index ee00d7c5f4499867c2c706ddcf314c1bfae0a866..cf9df34a2467f8461c4c284b4848c54b76edf452 100644 --- a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cu +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu @@ -14,28 +14,31 @@ limitations under the License. */ #include -#include "paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" namespace paddle { namespace operators { template -__global__ void AmpCheckFiniteAndScale(const T* in, const T* scale, int num, - bool* found_inf, T* out) { +__global__ void GpuInverse(const T* s, T* o) { + *o = Inverse(*s); +} + +template +__global__ void CheckFiniteAndUnscale(const T* in, const T* scale, int num, + bool* found_inf, T* out) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < num) { if (!isfinite(in[idx])) { - *found_inf = 1; + *found_inf = true; } - out[idx] = *found_inf ? in[idx] : in[idx] * scale[0]; + out[idx] = *found_inf ? 
in[idx] : in[idx] * (*scale); } } template -class AmpCheckFiniteAndScaleKernel - : public framework::OpKernel { +class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { auto& dev_ctx = ctx.template device_context(); @@ -48,6 +51,12 @@ class AmpCheckFiniteAndScaleKernel bool* found_inf_data = found_inf->mutable_data(dev_ctx.GetPlace()); cudaMemset(found_inf_data, false, found_inf->numel() * sizeof(bool)); + framework::Tensor inverse_scale = + ctx.AllocateTmpTensor({1}, dev_ctx); + T* inverse_scale_v = inverse_scale.template data(); + + GpuInverse<<<1, 1, 0, dev_ctx.stream()>>>(scale_data, inverse_scale_v); + for (size_t i = 0; i < xs.size(); ++i) { const auto* x = xs[i]; auto* out = outs[i]; @@ -55,11 +64,11 @@ class AmpCheckFiniteAndScaleKernel T* out_data = out->mutable_data(dev_ctx.GetPlace()); int num = x->numel(); - int block = 512; + int block = 1024; int grid = (num + block - 1) / block; VLOG(3) << "launch kernel"; - AmpCheckFiniteAndScale<<>>( - x_data, scale_data, num, found_inf_data, out_data); + CheckFiniteAndUnscale<<>>( + x_data, inverse_scale_v, num, found_inf_data, out_data); VLOG(3) << "finish kernel"; } } @@ -68,9 +77,6 @@ class AmpCheckFiniteAndScaleKernel } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - amp_check_finite_and_scale, - ops::AmpCheckFiniteAndScaleKernel, - ops::AmpCheckFiniteAndScaleKernel); +REGISTER_OP_CUDA_KERNEL(check_finite_and_unscale, + ops::CheckFiniteAndUnscaleGpuKernel, + ops::CheckFiniteAndUnscaleGpuKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.h b/paddle/fluid/operators/amp/check_finite_and_unscale_op.h new file mode 100644 index 0000000000000000000000000000000000000000..4fb8744d0eee3c58f2948c5a466e08c2700b4332 --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/operators/isfinite_op.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +template +inline HOSTDEVICE T Inverse(T s) { + return 1.0 / s; +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cc b/paddle/fluid/operators/amp/update_loss_scaling_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..fca3c531b40550952273f03f41bbc62cbff170fc --- /dev/null +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cc @@ -0,0 +1,170 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/amp/update_loss_scaling_op.h"
+#include
+#include
+#include
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class UpdateLossScalingOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "update_loss_scaling");
+    OP_INOUT_CHECK(ctx->HasInput("FoundInfinite"), "Input", "FoundInfinite",
+                   "update_loss_scaling");
+    OP_INOUT_CHECK(ctx->HasInput("PrevLossScaling"), "Input", "PrevLossScaling",
+                   "update_loss_scaling");
+    OP_INOUT_CHECK(ctx->HasInput("InGoodSteps"), "Input", "InGoodSteps",
+                   "update_loss_scaling");
+    OP_INOUT_CHECK(ctx->HasInput("InBadSteps"), "Input", "InBadSteps",
+                   "update_loss_scaling");
+    OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out",
+                   "update_loss_scaling");
+    OP_INOUT_CHECK(ctx->HasOutput("LossScaling"), "Output", "LossScaling",
+                   "update_loss_scaling");
+    OP_INOUT_CHECK(ctx->HasOutput("OutGoodSteps"), "Output", "OutGoodSteps",
+                   "update_loss_scaling");
+    OP_INOUT_CHECK(ctx->HasOutput("OutBadSteps"), "Output", "OutBadSteps",
+                   "update_loss_scaling");
+    auto x_dims = ctx->GetInputsDim("X");
+    ctx->SetOutputsDim("Out", x_dims);
+    ctx->SetOutputDim("LossScaling", {1});
+    ctx->SetOutputDim("OutGoodSteps", {1});
+    ctx->SetOutputDim("OutBadSteps", {1});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "PrevLossScaling"),
+        ctx.device_context());
+  }
+};
+
+class UpdateLossScalingOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensors) The input tensors of update_loss_scaling operator.")
+        .AsDuplicable();
+    AddInput("FoundInfinite",
+             "(Tensor) 1-dim tensor, contains a bool scalar, which indicates "
+             "whether there is any infinite gradient.");
+    AddInput("PrevLossScaling",
+             "(Tensor) 1-dim tensor, previous loss scaling.");
+    AddInput("InGoodSteps",
+             "(Tensor) 1-dim tensor, accumulates good steps in which all "
+             "gradients are finite.");
+    AddInput("InBadSteps",
+             "(Tensor) 1-dim tensor, accumulates bad steps in which some "
+             "gradients are infinite.");
+    AddOutput("Out",
+              "(Tensors) The output tensors of update_loss_scaling operator.")
+        .AsDuplicable();
+    AddOutput("LossScaling", "(Tensor) 1-dim tensor, updated loss scaling.");
+    AddOutput("OutGoodSteps", "(Tensor) 1-dim tensor, updated good steps.");
+    AddOutput("OutBadSteps", "(Tensor) 1-dim tensor, updated bad steps.");
+    AddAttr("incr_every_n_steps",
+            "Increases loss scaling every n consecutive steps with finite "
+            "gradients.");
+    AddAttr("decr_every_n_nan_or_inf",
+            "Decreases loss scaling every n accumulated steps with nan or "
+            "inf gradients.");
+    AddAttr("incr_ratio",
+            "The multiplier to use when increasing the loss scaling.")
+        .AddCustomChecker([](float incr_ratio) {
+          PADDLE_ENFORCE_EQ(incr_ratio > 1.0f, true,
+                            platform::errors::InvalidArgument(
+                                "'incr_ratio' should be greater than 1, but "
+                                "the received value is %f.",
+                                incr_ratio));
+        });
+    AddAttr(
+        "decr_ratio",
+        "The less-than-one multiplier to use when decreasing loss scaling.")
+        .AddCustomChecker([](float decr_ratio) {
+          PADDLE_ENFORCE_EQ(decr_ratio > 0.0f && decr_ratio < 1.0f, true,
+                            platform::errors::InvalidArgument(
+                                "'decr_ratio' should be between 0 and 1, but "
+                                "the received value is %f.",
+                                decr_ratio));
+        });
+    AddComment(R"DOC(
+Update loss scaling according to overall gradients. If all gradients are
+finite after incr_every_n_steps consecutive steps, loss scaling will increase
+by incr_ratio. Otherwise, loss scaling will decrease by decr_ratio after
+decr_every_n_nan_or_inf accumulated steps in which some gradients are
+infinite.
+
+)DOC");
+  }
+};
+
+template
+class UpdateLossScalingFunctor {
+ public:
+  void operator()(const platform::CPUDeviceContext& ctx,
+                  const bool* found_inf_data, const T* pre_loss_scaling_data,
+                  const int* good_in_data, const int* bad_in_data,
+                  const int incr_every_n_steps,
+                  const int decr_every_n_nan_or_inf, const float incr_ratio,
+                  const float decr_ratio, T* updated_loss_scaling_data,
+                  int* good_out_data, int* bad_out_data) const {
+    Update(found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data,
+           incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio,
+           decr_ratio, updated_loss_scaling_data, good_out_data,
+           bad_out_data);
+  }
+};
+
+template
+class LazyZeroInputs {
+ public:
+  void operator()(const platform::CPUDeviceContext& dev_ctx,
+                  const bool* found_inf_data,
+                  const std::vector& xs,
+                  const std::vector& outs) const {
+    if (*found_inf_data) {
+      VLOG(1) << "-- UpdateLossScaling: Infinite values are found in grads. --";
+      for (size_t i = 0; i < xs.size(); ++i) {
+        auto* out = outs[i];
+        T* out_data = out->mutable_data(dev_ctx.GetPlace());
+        int num = out->numel();
+        std::memset(out_data, 0, num * sizeof(T));
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+using CPU = paddle::platform::CPUDeviceContext;
+
+REGISTER_OPERATOR(
+    update_loss_scaling, ops::UpdateLossScalingOp,
+    ops::UpdateLossScalingOpMaker,
+    paddle::framework::EmptyGradOpMaker,
+    paddle::framework::EmptyGradOpMaker);
+
+REGISTER_OP_CPU_KERNEL(update_loss_scaling,
+                       ops::UpdateLossScalingKernel,
+                       ops::UpdateLossScalingKernel);
diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cu b/paddle/fluid/operators/amp/update_loss_scaling_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2bc60423d247447adf18eb3ef050ca9b395a2e2f
--- /dev/null
+++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cu
@@ -0,0 +1,84 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { + +template +__global__ void GpuUpdateLossScaling( + const bool* found_inf_data, const T* pre_loss_scaling_data, + const int* good_in_data, const int* bad_in_data, + const int incr_every_n_steps, const int decr_every_n_nan_or_inf, + const float incr_ratio, const float decr_ratio, + T* updated_loss_scaling_data, int* good_out_data, int* bad_out_data) { + Update(found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data, + incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, + updated_loss_scaling_data, good_out_data, bad_out_data); +} + +template +class UpdateLossScalingFunctor { + public: + void operator()(const platform::CUDADeviceContext& dev_ctx, + const bool* found_inf_data, const T* pre_loss_scaling_data, + const int* good_in_data, const int* bad_in_data, + const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, const float incr_ratio, + const float decr_ratio, T* updated_loss_scaling_data, + int* good_out_data, int* bad_out_data) const { + GpuUpdateLossScaling<<<1, 1, 0, dev_ctx.stream()>>>( + found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data, + incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, + updated_loss_scaling_data, good_out_data, bad_out_data); + } +}; + +template +class LazyZeroInputs { + public: + void operator()(const platform::CUDADeviceContext& dev_ctx, + const bool* found_inf_data, + const std::vector& xs, + const std::vector& outs) const { + const auto gpu_place = + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); + bool has_inf{false}; + memory::Copy(platform::CPUPlace(), &has_inf, gpu_place, found_inf_data, + sizeof(bool), dev_ctx.stream()); + if (has_inf) { + VLOG(1) << "-- UpdateLossScaling: Infinite values are found in grads. --"; + for (size_t i = 0; i < xs.size(); ++i) { + auto* out = outs[i]; + T* out_data = out->mutable_data(dev_ctx.GetPlace()); + int num = out->numel(); + cudaMemset(out_data, 0, num * sizeof(T)); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using GPU = paddle::platform::CUDADeviceContext; + +REGISTER_OP_CUDA_KERNEL(update_loss_scaling, + ops::UpdateLossScalingKernel, + ops::UpdateLossScalingKernel); diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.h b/paddle/fluid/operators/amp/update_loss_scaling_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ca23b72eff0e85ab94c4d1f11e986f69b4e2d776 --- /dev/null +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.h @@ -0,0 +1,123 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include
+
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/errors.h"
+#include "paddle/fluid/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template
+HOSTDEVICE void Update(const bool* found_inf_data,
+                       const T* pre_loss_scaling_data, const int* good_in_data,
+                       const int* bad_in_data, const int incr_every_n_steps,
+                       const int decr_every_n_nan_or_inf,
+                       const float incr_ratio, const float decr_ratio,
+                       T* updated_loss_scaling_data, int* good_out_data,
+                       int* bad_out_data) {
+  if (*found_inf_data) {
+    *good_out_data = 0;
+    *bad_out_data = *bad_in_data + 1;
+    if (*bad_out_data == decr_every_n_nan_or_inf) {
+      T new_loss_scaling = *pre_loss_scaling_data * decr_ratio;
+      *updated_loss_scaling_data = new_loss_scaling < static_cast(1)
+                                       ? static_cast(1)
+                                       : new_loss_scaling;
+      *bad_out_data = 0;
+    }
+  } else {
+    *bad_out_data = 0;
+    *good_out_data = *good_in_data + 1;
+    if (*good_out_data == incr_every_n_steps) {
+      T new_loss_scaling = *pre_loss_scaling_data * incr_ratio;
+      *updated_loss_scaling_data = std::isfinite(new_loss_scaling)
+                                       ? new_loss_scaling
+                                       : *pre_loss_scaling_data;
+      *good_out_data = 0;
+    }
+  }
+}
+
+template
+class UpdateLossScalingFunctor {
+ public:
+  void operator()(const DeviceContext& dev_ctx, const bool* found_inf_data,
+                  const T* pre_loss_scaling_data, const int* good_in_data,
+                  const int* bad_in_data, const int incr_every_n_steps,
+                  const int decr_every_n_nan_or_inf, const float incr_ratio,
+                  const float decr_ratio, T* updated_loss_scaling_data,
+                  int* good_out_data, int* bad_out_data) const;
+};
+
+template
+class LazyZeroInputs {
+ public:
+  void operator()(const DeviceContext& dev_ctx, const bool* found_inf_data,
+                  const std::vector& xs,
+                  const std::vector& outs) const;
+};
+
+template
+class UpdateLossScalingKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto xs = ctx.MultiInput("X");
+    const auto* found_inf = ctx.Input("FoundInfinite");
+    const auto* pre_loss_scaling = ctx.Input("PrevLossScaling");
+    const auto* good_in = ctx.Input("InGoodSteps");
+    const auto* bad_in = ctx.Input("InBadSteps");
+    auto outs = ctx.MultiOutput("Out");
+    auto* updated_loss_scaling = ctx.Output("LossScaling");
+    auto* good_out = ctx.Output("OutGoodSteps");
+    auto* bad_out = ctx.Output("OutBadSteps");
+
+    PADDLE_ENFORCE_EQ(found_inf->numel(), 1,
+                      platform::errors::InvalidArgument(
+                          "FoundInfinite must have only one element."));
+
+    const bool* found_inf_data = found_inf->data();
+    const T* pre_loss_scaling_data = pre_loss_scaling->data();
+    const int* good_in_data = good_in->data();
+    const int* bad_in_data = bad_in->data();
+
+    auto& dev_ctx = ctx.template device_context();
+    T* updated_loss_scaling_data =
+        updated_loss_scaling->mutable_data(dev_ctx.GetPlace());
+    int* good_out_data = good_out->mutable_data(dev_ctx.GetPlace());
+    int* bad_out_data = bad_out->mutable_data(dev_ctx.GetPlace());
+
+    const int incr_every_n_steps = ctx.Attr("incr_every_n_steps");
+    const int decr_every_n_nan_or_inf =
+        ctx.Attr("decr_every_n_nan_or_inf");
+    const float incr_ratio = ctx.Attr("incr_ratio");
+    const float decr_ratio = ctx.Attr("decr_ratio");
+    UpdateLossScalingFunctor{}(
+        dev_ctx, found_inf_data, pre_loss_scaling_data, good_in_data,
+        bad_in_data, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio,
decr_ratio, updated_loss_scaling_data, good_out_data, bad_out_data); + LazyZeroInputs{}(dev_ctx, found_inf_data, xs, outs); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc index 60f29ba39a8ee64f9fe5d95e685cac1fb52dfd21..4940649c2a32649a068c364081071ac840b4e25a 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cc +++ b/paddle/fluid/operators/controlflow/compare_op.cc @@ -111,8 +111,16 @@ class CompareOp : public framework::OperatorWithKernel { framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); // CompareOp kernel's device type is decided by input tensor place bool force_cpu = ctx.Attr("force_cpu"); - kt.place_ = force_cpu ? platform::CPUPlace() - : ctx.Input("X")->place(); + if (force_cpu) { + kt.place_ = platform::CPUPlace(); + } else { + if (ctx.Input("X")->place().type() != + typeid(platform::CUDAPinnedPlace)) { + kt.place_ = ctx.Input("X")->place(); + } else { + kt.place_ = ctx.GetPlace(); + } + } return kt; } }; diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 537063640e4ef6e49f7b991482f0f3122ecef02f..c2b7c27ab4adb5282ad7aa5f7a16c15f81ba5f5e 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -41,9 +41,13 @@ detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_fo detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc) if(WITH_GPU) - detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub) - detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS memory cub) - detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc collect_fpn_proposals_op.cu DEPS memory cub) + set(TMPDEPS memory) + if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + set(TMPDEPS memory cub) + endif() + detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS ${TMPDEPS}) + detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS ${TMPDEPS}) + detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc collect_fpn_proposals_op.cu DEPS ${TMPDEPS}) else() detection_library(generate_proposals_op SRCS generate_proposals_op.cc) detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc) diff --git a/paddle/fluid/operators/dist_op.h b/paddle/fluid/operators/dist_op.h index ca03400cfd1ef9a27ba8e725381515d5e4ebc0ba..a2279e40623b4ba2f0421e73a6148b89eb970e71 100644 --- a/paddle/fluid/operators/dist_op.h +++ b/paddle/fluid/operators/dist_op.h @@ -176,14 +176,26 @@ static void DistGradFunction(const framework::ExecutionContext& context) { } else if (p == INFINITY || p == -INFINITY) { // p=inf or -inf, Lp-norm = |z_i|, the j-th element of dz tends to 0 if // j!=i, or equals to sign(z_i) * dout if j=i. 
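
The Update function in the loss-scaling hunk above grows the scale after a run of finite steps and shrinks it (with a floor of 1) after a run of overflowing steps. The following standalone host-side sketch mirrors that rule outside the framework; the struct, driver loop, and printed trace are illustrative, not Paddle API.

```cpp
// Minimal host-side sketch of the dynamic loss-scaling rule from the
// update_loss_scaling hunk above. Names and the driver are illustrative.
#include <cmath>
#include <cstdio>

struct ScalingState {
  float scale = 32768.0f;  // current loss scale
  int good = 0;            // consecutive finite steps
  int bad = 0;             // consecutive steps with inf/nan
};

void Update(bool found_inf, ScalingState* s, int incr_every_n_steps,
            int decr_every_n_nan_or_inf, float incr_ratio, float decr_ratio) {
  if (found_inf) {
    s->good = 0;
    if (++s->bad == decr_every_n_nan_or_inf) {
      float next = s->scale * decr_ratio;
      s->scale = next < 1.0f ? 1.0f : next;  // shrink, but never below 1
      s->bad = 0;
    }
  } else {
    s->bad = 0;
    if (++s->good == incr_every_n_steps) {
      float next = s->scale * incr_ratio;
      if (std::isfinite(next)) s->scale = next;  // grow only while finite
      s->good = 0;
    }
  }
}

int main() {
  ScalingState s;
  bool steps[] = {true, true, false, false, false, false};  // 2 overflows
  for (bool overflow : steps) {
    Update(overflow, &s, /*incr_every_n_steps=*/4,
           /*decr_every_n_nan_or_inf=*/2, /*incr_ratio=*/2.0f,
           /*decr_ratio=*/0.5f);
    std::printf("scale=%.1f good=%d bad=%d\n", s.scale, s.good, s.bad);
  }
}
```
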
- grad_t.device(place) = - (x_minux_y_abs == out_t.broadcast(out_bcast_dims)).template cast() * - sign * out_grad_t.broadcast(out_bcast_dims); + if (platform::is_cpu_place(context.GetPlace())) { + grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims)) + .template cast() * + sign.eval() * out_grad_t.broadcast(out_bcast_dims); + } else { + grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims)) + .template cast() * + sign * out_grad_t.broadcast(out_bcast_dims); + } } else { // dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout - grad_t.device(place) = - (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * sign * - out_grad_t.broadcast(out_bcast_dims); + if (platform::is_cpu_place(context.GetPlace())) { + grad_t.device(place) = + (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * + sign.eval() * out_grad_t.broadcast(out_bcast_dims); + } else { + grad_t.device(place) = + (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * sign * + out_grad_t.broadcast(out_bcast_dims); + } } Eigen::DSizes x_reshape_dims; diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc index 457d9e79d7da171ef526d5cab0e59b021cb64f98..5a398fa50febe2efffd588ce8f3612f1f9cec0b6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc @@ -49,8 +49,6 @@ REGISTER_OP_WITHOUT_GRADIENT(elementwise_floordiv, ops::ElementwiseOp, REGISTER_OP_CPU_KERNEL( elementwise_floordiv, - ops::ElementwiseFloorDivKernel, - ops::ElementwiseFloorDivKernel, ops::ElementwiseFloorDivKernel, ops::ElementwiseFloorDivKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu index f63d6f037632c1a6a05726b933b2258adc113ee3..60846d1e8fee1c7f68ac101f18355750c2c15a4d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu @@ -19,7 +19,5 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( elementwise_floordiv, - ops::ElementwiseFloorDivKernel, - ops::ElementwiseFloorDivKernel, ops::ElementwiseFloorDivKernel, ops::ElementwiseFloorDivKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h index 8afe2133c0488bbe04ec4803aac5dce6573f634d..5dc93740949e6e7c25be564927c8fcffde1a18d6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h @@ -14,7 +14,6 @@ limitations under the License. 
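
The dist_op hunk above splits the gradient into two branches and, on CPU, forces the Eigen `sign` expression through `.eval()` so it is materialized before reuse. The formulas themselves are easy to check in scalar form; this plain C++ sketch over vectors is illustrative and is not the Paddle kernel.

```cpp
// Scalar sketch of the two dist_op gradient branches shown above:
//   p = +/-inf : dz_j = 1{|x_j - y_j| == out} * sign(x_j - y_j) * dout
//   otherwise  : dz_j = (|x_j - y_j| / out)^(p-1) * sign(x_j - y_j) * dout
#include <cmath>
#include <cstdio>
#include <vector>

std::vector<double> DistGrad(const std::vector<double>& x,
                             const std::vector<double>& y, double out,
                             double dout, double p) {
  std::vector<double> dz(x.size());
  for (size_t j = 0; j < x.size(); ++j) {
    double d = x[j] - y[j];
    double sign = (d > 0) - (d < 0);
    if (std::isinf(p)) {
      // Only the coordinate that attains the max-norm carries gradient.
      dz[j] = (std::fabs(d) == out) ? sign * dout : 0.0;
    } else {
      dz[j] = std::pow(std::fabs(d) / out, p - 1) * sign * dout;
    }
  }
  return dz;
}

int main() {
  std::vector<double> x{1.0, 4.0}, y{0.0, 1.0};
  double out = 3.0;  // inf-norm of x - y
  for (double g : DistGrad(x, y, out, 1.0, INFINITY)) std::printf("%g ", g);
  std::printf("\n");  // prints: 0 1
}
```
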
*/ #pragma once -#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" @@ -62,15 +61,8 @@ void elementwise_floor_div(const framework::ExecutionContext &ctx, const framework::Tensor *x, const framework::Tensor *y, framework::Tensor *z) { int axis = ctx.Attr("axis"); - auto x_dims = x->dims(); - auto y_dims = y->dims(); - if (x_dims.size() >= y_dims.size()) { - ElementwiseComputeEx, DeviceContext, T>( - ctx, x, y, axis, FloorDivFunctor(), z); - } else { - ElementwiseComputeEx, DeviceContext, T>( - ctx, x, y, axis, InverseFloorDivFunctor(), z); - } + ElementwiseComputeEx, DeviceContext, T>( + ctx, x, y, axis, FloorDivFunctor(), z); } template diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 718321b441b2025afea9d913855b26a82cda8075..e4d3ea6d7291eff8911d8419cda96f2d2738b9a1 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -33,22 +33,7 @@ class ElementwiseMulOp : public ElementwiseOp { auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); #ifdef PADDLE_WITH_MKLDNN - using mkldnn::memory; - auto CanMKLDNNElementwiseMulBeUsed = [&]() { - auto x_dims = ctx.Input("X")->dims(); - auto y_dims = ctx.Input("Y")->dims(); - int rankdiff = x_dims.size() - y_dims.size(); - // TODO(jczaja): Remove this when oneDNN performance for scalar - // broadcasting - // is improved (Ernie large situation) - if (rankdiff != 0 && y_dims.size() == 1 && y_dims[0] == 1) { - return false; - } - - return true; - }; - - if (platform::CanMKLDNNBeUsed(ctx) && CanMKLDNNElementwiseMulBeUsed()) { + if (platform::CanMKLDNNBeUsed(ctx)) { return framework::OpKernelType(input_data_type, ctx.GetPlace(), framework::DataLayout::kMKLDNN, framework::LibraryType::kMKLDNN); diff --git a/paddle/fluid/operators/empty_op.cc b/paddle/fluid/operators/empty_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f539e2e6f6d2d6faa084d1e62ec894b4b65e96bf --- /dev/null +++ b/paddle/fluid/operators/empty_op.cc @@ -0,0 +1,132 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/empty_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class EmptyOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("ShapeTensor", + "(Tensor), optional). The shape of the output." + "It has a higher priority than Attr(shape).") + .AsDispensable(); + AddInput("ShapeTensorList", + "(vector>, optional). The shape of the output. " + "It has a higher priority than Attr(shape)." 
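
The floordiv hunks above drop the float/double kernel registrations and the rank-dependent InverseFloorDivFunctor branch, leaving a single functor applied uniformly to integer inputs. The sketch below is a stand-in for that elementwise-functor pattern; the hunk does not show Paddle's own rounding or divide-by-zero behavior, so this version uses mathematical floor division and a simple zero guard as assumptions.

```cpp
// Stand-in for the single-functor elementwise pass used above; rounding and
// error handling here are illustrative assumptions, not Paddle's functor.
#include <cstdint>
#include <stdexcept>
#include <vector>

struct FloorDiv {
  int64_t operator()(int64_t a, int64_t b) const {
    if (b == 0) throw std::invalid_argument("floordiv by zero");
    int64_t q = a / b;                              // C++ truncates toward 0,
    if ((a % b != 0) && ((a < 0) != (b < 0))) --q;  // so round down here
    return q;
  }
};

template <typename F>
std::vector<int64_t> ElementwiseApply(const std::vector<int64_t>& x,
                                      const std::vector<int64_t>& y, F f) {
  std::vector<int64_t> z(x.size());
  for (size_t i = 0; i < x.size(); ++i) z[i] = f(x[i], y[i]);
  return z;
}

int main() {
  auto z = ElementwiseApply({7, -7}, {2, 2}, FloorDiv{});  // {3, -4}
  return z[0] == 3 && z[1] == -4 ? 0 : 1;
}
```
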
+ "The shape of the element in vector must be [1].") + .AsDuplicable() + .AsDispensable(); + AddAttr>("shape", + "(vector) The shape of the output") + .SetDefault({}); + AddAttr("dtype", "The data type of output tensor, Default is float") + .SetDefault(framework::proto::VarType::FP32); + AddOutput("Out", "(Tensor) The output tensor."); + AddComment(R"DOC(empty operator +Returns a tensor filled with uninitialized data. The shape of the tensor is +defined by the variable argument shape. + + +The type of the tensor is specify by `dtype`. +)DOC"); + } +}; + +class EmptyOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* context) const override { + OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "empty"); + + if (context->HasInput("ShapeTensor")) { + auto dims = context->GetInputDim("ShapeTensor"); + int num_ele = 1; + for (int i = 0; i < dims.size(); ++i) { + num_ele *= dims[i]; + } + + context->SetOutputDim("Out", framework::make_ddim({num_ele})); + } else if (context->HasInputs("ShapeTensorList")) { + std::vector out_dims; + auto dims_list = context->GetInputsDim("ShapeTensorList"); + for (size_t i = 0; i < dims_list.size(); ++i) { + auto& dims = dims_list[i]; + PADDLE_ENFORCE_EQ( + dims, framework::make_ddim({1}), + "ShapeError: The shape of Tensor in list must be [1]. " + "But received the shape " + "is [%s]", + dims); + + out_dims.push_back(dims[0]); + } + + context->SetOutputDim("Out", framework::make_ddim(out_dims)); + } else { + auto& shape = context->Attrs().Get>("shape"); + context->SetOutputDim("Out", framework::make_ddim(shape)); + } + } + + protected: + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + if (var_name == "ShapeTensor" || var_name == "ShapeTensorList") { + return expected_kernel_type; + } else { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& context) const override { + return framework::OpKernelType( + framework::proto::VarType::Type(context.Attr("dtype")), + context.GetPlace()); + } +}; + +class EmptyOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* context) const override { + auto data_type = static_cast( + BOOST_GET_CONST(int, context->GetAttr("dtype"))); + context->SetOutputDataType("Out", data_type); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR( + empty, ops::EmptyOp, ops::EmptyOpMaker, ops::EmptyOpVarTypeInference, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(empty, ops::EmptyKernel, + ops::EmptyKernel, + ops::EmptyKernel, + ops::EmptyKernel, + ops::EmptyKernel, + ops::EmptyKernel); diff --git a/paddle/fluid/operators/empty_op.cu.cc b/paddle/fluid/operators/empty_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..22799e507aeff7940274f729b174f50bfd9132a5 --- /dev/null +++ b/paddle/fluid/operators/empty_op.cu.cc @@ -0,0 +1,26 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/empty_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + empty, ops::EmptyKernel, + ops::EmptyKernel, + ops::EmptyKernel, + ops::EmptyKernel, + ops::EmptyKernel, + ops::EmptyKernel); diff --git a/paddle/fluid/operators/empty_op.h b/paddle/fluid/operators/empty_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9c91377683870917db28f6f6a5f3f3b1b4a1962f --- /dev/null +++ b/paddle/fluid/operators/empty_op.h @@ -0,0 +1,45 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/utils.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class EmptyKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto dtype = static_cast( + context.Attr("dtype")); + + Tensor *out_tensor = context.Output("Out"); + + auto shape = GetShape(context); + out_tensor->Resize(shape); + + out_tensor->mutable_data(context.GetPlace(), dtype); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index 3c898ac29f0cab572d199eaafe951751682d4834..83e205367a7af62c52825297d92571c306be2c42 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -228,6 +228,26 @@ class ExpandGradOpMaker : public framework::SingleGradOpMaker { } }; +template +class ExpandDoubleGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); + op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); + if (this->HasInput("expand_times_tensor")) { + op->SetInput("expand_times_tensor", this->Input("expand_times_tensor")); + } + if (this->HasInput("ExpandTimes")) { + op->SetInput("ExpandTimes", this->Input("ExpandTimes")); + } + op->SetAttrMap(this->Attrs()); + op->SetType("expand"); + } +}; + DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandGradNoNeedBufVarsInferer, "X"); } // namespace operators @@ -238,6 +258,8 @@ REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker, 
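
The ExpandDoubleGradOpMaker above simply re-emits "expand": tiling is a linear map, its first-order gradient is a segment-sum, and differentiating that sum with respect to the incoming gradient is tiling again. A 1-D sketch of that reasoning (illustrative names, not the maker machinery):

```cpp
// Why the double-grad op of expand is expand itself: expand is linear, its
// VJP reduces the tiled copies, and the derivative of that VJP w.r.t. the
// upstream gradient tiles again. 1-D illustration only.
#include <cassert>
#include <vector>

// Forward: y = expand(x, times)
std::vector<double> Expand(const std::vector<double>& x, int times) {
  std::vector<double> y;
  for (int t = 0; t < times; ++t) y.insert(y.end(), x.begin(), x.end());
  return y;
}

// First-order grad: dx = reduce(dy) over the tiled copies.
std::vector<double> ExpandGrad(const std::vector<double>& dy, size_t n,
                               int times) {
  std::vector<double> dx(n, 0.0);
  for (int t = 0; t < times; ++t)
    for (size_t i = 0; i < n; ++i) dx[i] += dy[t * n + i];
  return dx;
}

int main() {
  // Second-order: a seed ddx on the first-order gradient propagates to the
  // output side by expanding it, i.e. ddy = Expand(ddx, times).
  std::vector<double> ddx{1.0, 2.0};
  auto ddy = Expand(ddx, 3);
  // Consistency check: reducing ddy back yields times * ddx.
  auto back = ExpandGrad(ddy, 2, 3);
  assert(back[0] == 3.0 && back[1] == 6.0);
}
```
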
ops::ExpandGradOpMaker, ops::ExpandGradOpMaker); REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp, + ops::ExpandDoubleGradOpMaker, + ops::ExpandDoubleGradOpMaker, ops::ExpandGradNoNeedBufVarsInferer); REGISTER_OP_CPU_KERNEL( expand, ops::ExpandKernel, diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc index 359d512c341529579a56dbe840e5eef0aa3062a5..a1ee47b7f93910a481c6e0793c306e2b190c774d 100644 --- a/paddle/fluid/operators/expand_v2_op.cc +++ b/paddle/fluid/operators/expand_v2_op.cc @@ -230,6 +230,26 @@ class ExpandV2GradOpMaker : public framework::SingleGradOpMaker { } }; +template +class ExpandV2DoubleGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("expand_v2"); + op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); + op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); + if (this->HasInput("expand_shapes_tensor")) { + op->SetInput("expand_shapes_tensor", this->Input("expand_shapes_tensor")); + } + if (this->HasInput("Shape")) { + op->SetInput("Shape", this->Input("Shape")); + } + op->SetAttrMap(this->Attrs()); + } +}; + DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandV2GradNoNeedBufVarsInferer, "X"); } // namespace operators @@ -240,6 +260,8 @@ REGISTER_OPERATOR(expand_v2, ops::ExpandV2Op, ops::ExpandV2OpMaker, ops::ExpandV2GradOpMaker, ops::ExpandV2GradOpMaker); REGISTER_OPERATOR(expand_v2_grad, ops::ExpandV2GradOp, + ops::ExpandV2DoubleGradOpMaker, + ops::ExpandV2DoubleGradOpMaker, ops::ExpandV2GradNoNeedBufVarsInferer); REGISTER_OP_CPU_KERNEL( expand_v2, ops::ExpandV2Kernel, diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h index 74939da08b38dc147c156011759757a605db9444..6fea8fe98bf0e19bbbb023c91f4f9900f5ec1859 100644 --- a/paddle/fluid/operators/fill_constant_op.h +++ b/paddle/fluid/operators/fill_constant_op.h @@ -27,27 +27,6 @@ namespace operators { using Tensor = framework::Tensor; -inline framework::DDim GetShape(const framework::ExecutionContext &ctx, - std::string op_type) { - // 1. shape is a Tensor - if (ctx.HasInput("ShapeTensor")) { - auto *shape_tensor = ctx.Input("ShapeTensor"); - auto vec_shape = GetDataFromTensor(shape_tensor); - return framework::make_ddim(vec_shape); - } - - // 2. shape is a list/tuple containing Tensor - auto shape_tensor_list = ctx.MultiInput("ShapeTensorList"); - if (shape_tensor_list.size() > 0) { - auto vec_shape = GetDataFromTensorList(shape_tensor_list); - return framework::make_ddim(vec_shape); - } - - // 3. 
shape is a list/tuple without containing Tensor
-  auto vec_shape = ctx.Attr<std::vector<int64_t>>("shape");
-  return framework::make_ddim(vec_shape);
-}
-
 template <typename T>
 class FillConstantKernel : public framework::OpKernel<T> {
  public:
@@ -93,8 +72,7 @@ class FillConstantKernel : public framework::OpKernel<T> {
       }
       value = tensor_data[0];
     }
-    const std::string op_type = "fill_constant";
-    auto shape = GetShape(ctx, op_type);
+    auto shape = GetShape(ctx);
 
     if (out_var->IsType<framework::LoDTensor>()) {
       tensor = out_var->GetMutable<framework::LoDTensor>();
diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
index c698cb1405fd6f049e01b23613e175ba39c4976e..79fa268f3884b2710fe08eb2907dbd989479d7e6 100644
--- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
+++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
@@ -367,8 +367,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
     auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
 
     for (int64_t i = 0; i < ids_numel; ++i) {
-      PADDLE_ENFORCE_LT(ids_data[i], row_number);
-      PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i);
+      PADDLE_ENFORCE_LT(
+          ids_data[i], row_number,
+          platform::errors::OutOfRange(
+              "Value of Ids %d should be less than dict size %d.", i,
+              row_number));
+      PADDLE_ENFORCE_GE(ids_data[i], 0,
+                        platform::errors::OutOfRange(
+                            "Value of Ids %d should be greater than zero.", i));
       memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width,
              row_width * sizeof(T));
     }
@@ -473,8 +478,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
     auto blas = math::GetBlas<platform::CPUDeviceContext, T>(dev_ctx);
 
     for (int64_t i = 0; i < ids_numel; ++i) {
-      PADDLE_ENFORCE_LT(ids_data[i], row_number);
-      PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i);
+      PADDLE_ENFORCE_LT(
+          ids_data[i], row_number,
+          platform::errors::OutOfRange(
+              "Value of Ids %d should be less than dict size %d.", i,
+              row_number));
+      PADDLE_ENFORCE_GE(ids_data[i], 0,
+                        platform::errors::OutOfRange(
+                            "Value of Ids %d should be greater than zero.", i));
       memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width,
              row_width * sizeof(T));
     }
diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc
index d0920098f606e49d4d1a3e4cb6d8a2b6c44ca267..4013906609603e31b798e333d55ecccba197506a 100644
--- a/paddle/fluid/operators/fused/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fused/fusion_gru_op.cc
@@ -30,16 +30,18 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
   OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fusion_gru");
   OP_INOUT_CHECK(ctx->HasInput("WeightX"), "Input", "WeightX", "fusion_gru");
   OP_INOUT_CHECK(ctx->HasInput("WeightH"), "Input", "WeightH", "fusion_gru");
-
   OP_INOUT_CHECK(ctx->HasOutput("XX"), "Output", "XX", "fusion_gru");
   OP_INOUT_CHECK(ctx->HasOutput("Hidden"), "Output", "Hidden", "fusion_gru");
-
   auto x_dims = ctx->GetInputDim("X");
-  PADDLE_ENFORCE_EQ(x_dims.size(), 2,
-                    platform::errors::InvalidArgument(
-                        "Input(X)'s rank must be 2, but received input dim "
-                        "size is:%d, input dim is:[%s]",
-                        x_dims.size(), x_dims));
+  auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1)
+                        ? 
framework::flatten_to_2d(x_dims, 1) + : x_dims; + PADDLE_ENFORCE_EQ( + x_mat_dims.size(), 2, + platform::errors::InvalidArgument("The size of input X dims should be 2, " + "or 3 with second dimension equal to " + "1, but now Input X dim is:[%s] ", + x_dims)); auto wx_dims = ctx->GetInputDim("WeightX"); PADDLE_ENFORCE_EQ(wx_dims.size(), 2, @@ -47,12 +49,14 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { "The rank of Input(WeightX) should be 2, but received " "WeightX dim size is:%d, WeightX dim is:[%s] ", wx_dims.size(), wx_dims)); - PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1], - platform::errors::InvalidArgument( - "The first dimension of Input(WeightX) " - "should equal to second dimension of input x, but " - "received WeightX dimension is:%d, x dimension is:%d", - wx_dims[0], x_dims[1])); + PADDLE_ENFORCE_EQ( + wx_dims[0], x_mat_dims[1], + platform::errors::InvalidArgument( + "The first dimension of flattened WeightX" + "should equal to last dimension of flattened input X, but " + "received fattened WeightX dimension is:%d, flattened X dimension " + "is:%d", + wx_dims[0], x_mat_dims[1])); int frame_size = wx_dims[1] / 3; auto wh_dims = ctx->GetInputDim("WeightH"); @@ -102,24 +106,24 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { "received bias dim is:[%s], frame size is:%d", b_dims, frame_size)); } - framework::DDim out_dims({x_dims[0], frame_size}); + framework::DDim out_dims({x_mat_dims[0], frame_size}); ctx->SetOutputDim("Hidden", out_dims); ctx->ShareLoD("X", "Hidden"); int xx_width; if (ctx->Attrs().Get("use_seq")) { xx_width = wx_dims[1]; } else { - xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; + xx_width = x_mat_dims[1] > wx_dims[1] ? wx_dims[1] : x_mat_dims[1]; OP_INOUT_CHECK(ctx->HasOutput("ReorderedH0"), "Output", "ReorderedH0", "fusion_gru"); OP_INOUT_CHECK(ctx->HasOutput("BatchedInput"), "Output", "BatchedInput", "fusion_gru"); OP_INOUT_CHECK(ctx->HasOutput("BatchedOut"), "Output", "BatchedOut", "fusion_gru"); - ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]}); + ctx->SetOutputDim("BatchedInput", {x_mat_dims[0], wx_dims[1]}); ctx->SetOutputDim("BatchedOut", out_dims); } - ctx->SetOutputDim("XX", {x_dims[0], xx_width}); + ctx->SetOutputDim("XX", {x_mat_dims[0], xx_width}); ctx->ShareLoD("X", "XX"); } @@ -202,6 +206,27 @@ void FusionGRUOpMaker::Make() { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr( + "mkldnn_data_type", + "(string, default \"float32\"). Data type of mkldnn kernel") + .SetDefault("float32") + .InEnum({"float32", "int8", "bfloat16"}); + AddAttr("Scale_data", + "Scale to be used for int8 input/output data." + "Only used with MKL-DNN INT8.") + .SetDefault(1.0f); + AddAttr("Shift_data", + "Shift to be used for int8 input/output data." + "Only used with MKL-DNN INT8.") + .SetDefault(0.0f); + AddAttr>("Scale_weights", + "Scale_weights to be used for int8 weights data." + "Only used with MKL-DNN INT8.") + .SetDefault({1.0f}); + AddAttr("force_fp32_output", + "(bool, default false) Force INT8 kernel output FP32, only " + "used in MKL-DNN INT8") + .SetDefault(false); AddComment(R"DOC( The Fusion complete GRU Operator. 
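
The InferShape change above lets fusion_gru accept a 3-D input of shape [T, 1, M] by collapsing it to [T, M] with flatten_to_2d(x_dims, 1) before the usual 2-D checks. A free-standing sketch of that flattening rule (not the framework helper itself):

```cpp
// flatten_to_2d(dims, k) collapses dims[0..k) into rows and dims[k..n) into
// columns; for the [T, 1, M] case gated above this yields [T, 1 * M] = [T, M].
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t> FlattenTo2D(const std::vector<int64_t>& dims, int k) {
  int64_t rows = 1, cols = 1;
  for (int i = 0; i < static_cast<int>(dims.size()); ++i)
    (i < k ? rows : cols) *= dims[i];
  return {rows, cols};
}

int main() {
  std::vector<int64_t> x_dims{5, 1, 32};  // [T, 1, M]
  auto m = (x_dims.size() == 3 && x_dims[1] == 1) ? FlattenTo2D(x_dims, 1)
                                                  : x_dims;
  assert(m[0] == 5 && m[1] == 32);
}
```
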
This operator fuse the fully-connected operator into GRU, @@ -220,14 +245,17 @@ class FusionGRUKernel : public framework::OpKernel { } } -#define INIT_BASE_DEFINES \ - auto* x = ctx.Input("X"); \ - auto* wh = ctx.Input("WeightH"); \ - auto* xx = ctx.Output("XX"); \ - auto x_lod = x->lod(); \ - auto x_dims = x->dims(); /* T x M*/ \ - auto wh_dims = wh->dims(); /* D x 3D*/ \ - const int total_T = x_dims[0]; \ +#define INIT_BASE_DEFINES \ + auto* x = ctx.Input("X"); \ + auto* wh = ctx.Input("WeightH"); \ + auto* xx = ctx.Output("XX"); \ + auto x_lod = x->lod(); \ + auto x_dims = x->dims(); /* T x M*/ \ + auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1) \ + ? framework::flatten_to_2d(x_dims, 1) \ + : x_dims; \ + auto wh_dims = wh->dims(); /* D x 3D*/ \ + const int total_T = x_mat_dims[0]; \ const int D3 = wh_dims[1] #define INIT_OTHER_DEFINES \ @@ -236,7 +264,7 @@ class FusionGRUKernel : public framework::OpKernel { auto* bias = ctx.Input("Bias"); \ auto* hidden_out = ctx.Output("Hidden"); \ bool is_reverse = ctx.Attr("is_reverse"); \ - const int M = x_dims[1]; \ + const int M = x_mat_dims[1]; \ const int D = wh_dims[0]; \ const int D2 = D * 2; \ const jit::gru_attr_t attr( \ diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc index 3940aae53b8ef70c15311305ce13f8929400d405..5fad1b116de6437e62e311318832ad77e24a40cc 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc @@ -21,11 +21,12 @@ namespace operators { using paddle::framework::LoDTensor; using paddle::framework::Tensor; using paddle::platform::CPUDeviceContext; +using paddle::platform::CreateKey; using paddle::platform::MKLDNNGetDataType; using paddle::platform::MKLDNNMemDesc; using platform::to_void_cast; -template +template class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { public: GRUMKLDNNHandler(const paddle::framework::ExecutionContext& ctx, @@ -38,7 +39,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { const std::string& unique_name) : platform::MKLDNNHandlerT( dev_ctx, dev_ctx.GetEngine(), cpu_place, - platform::CreateKey(unique_name, Ti)), + CreateKey(unique_name, MKLDNNGetDataType(), Ti)), N(N), Ti(Ti), IC(IC), @@ -47,9 +48,29 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { // do not depend on Ti size but primitive and input/output memory do if (platform::MKLDNNDeviceContext::tls().get_cur_mkldnn_session_id() != platform::MKLDNNDeviceContextThreadLocals::kMKLDNNSessionID_Default) { - memory_key_ = unique_name; + memory_key_ = CreateKey(unique_name, MKLDNNGetDataType()); } else { - memory_key_ = unique_name + "-t:" + platform::ThreadIDasStr(); + memory_key_ = CreateKey(unique_name, MKLDNNGetDataType(), "-t:", + platform::ThreadIDasStr()); + } + + // Is it int8 kernel + const bool is_INT8 = std::is_same::value; + + if (is_INT8) { + // Int8 attributes + const float scale_data = ctx.Attr("Scale_data"); + const float shift_data = ctx.Attr("Shift_data"); + const auto scale_weights = ctx.Attr>("Scale_weights"); + + const int weights_scale_mask = + 0 + + (1 << 3) // bit, indicating the unique scales for `g` dim in `ldigo` + + + (1 << 4); // bit, indicating the unique scales for `o` dim in `ldigo` + + attr_.set_rnn_data_qparams(scale_data, shift_data); + attr_.set_rnn_weights_qparams(weights_scale_mask, scale_weights); } if (!this->isCached()) { @@ -63,6 +84,10 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { 
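
The handler changes above fold the oneDNN data type (and, outside the default session, the thread id) into the primitive-cache key, so f32 and int8 variants of the same fused GRU no longer collide in the blob cache. A stand-in sketch of that key composition; Paddle's CreateKey has its own signature, this is only the idea:

```cpp
// Stand-in for composing a primitive-cache key from heterogeneous parts, as
// the CreateKey(unique_name, MKLDNNGetDataType<T>(), Ti, ...) calls above do.
#include <sstream>
#include <string>
#include <thread>

template <typename... Args>
std::string MakeKey(const Args&... parts) {
  std::ostringstream os;
  ((os << parts << '-'), ...);  // C++17 fold over all key fragments
  return os.str();
}

int main() {
  int Ti = 24;  // sequence length the primitive was built for
  std::string f32_key = MakeKey("fusion_gru_x_wh", "f32", Ti);
  std::string u8_key = MakeKey("fusion_gru_x_wh", "u8", Ti,
                               "t:", std::this_thread::get_id());
  return f32_key != u8_key ? 0 : 1;  // distinct cached primitives
}
```
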
platform::errors::Unimplemented( "oneDNN fusion_gru supports only tanh as an activation.")); + // Weights for int8 kernel are of a type s8 + const auto weights_dt = + is_INT8 ? dnnl::memory::data_type::s8 : dnnl::memory::data_type::f32; + // oneDNN RNN dimensions const int64_t D = 1; // Directions const int64_t L = 1; // Layers (PP supports only 1 stacked layer) @@ -71,19 +96,16 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { // Create memory descriptors auto input_md = MKLDNNMemDesc({Ti, N, IC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::any); - auto weight_x_md = MKLDNNMemDesc( - {L, D, IC, G, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::any); - auto weight_h_md = MKLDNNMemDesc( - {L, D, OC, G, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::any); + auto weight_x_md = + MKLDNNMemDesc({L, D, IC, G, OC}, weights_dt, MKLDNNMemoryFormat::any); + auto weight_h_md = + MKLDNNMemDesc({L, D, OC, G, OC}, weights_dt, MKLDNNMemoryFormat::any); auto bias_md = MKLDNNMemDesc({L, D, G, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::ldgo); - auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType(), + auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::any); - auto h0_md = dnnl::memory::desc(); - if (h0) { - h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType(), - MKLDNNMemoryFormat::ldnc); - } + auto h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType(), + MKLDNNMemoryFormat::ldnc); // Create GRU oneDNN primitive const auto direction = @@ -91,7 +113,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { : dnnl::rnn_direction::unidirectional_left2right; this->AcquireForwardPrimitiveDescriptor( - dnnl::prop_kind::forward_inference, direction, input_md, h0_md, + attr_, dnnl::prop_kind::forward_inference, direction, input_md, h0_md, weight_x_md, weight_h_md, bias_md, hidden_md, dnnl::memory::desc()); } } @@ -101,29 +123,31 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { dnnl::memory::format_tag::ntc); } - void reorderRNNdata(const T* input_data, T* output_data, + void reorderRNNdata(void* input_data, void* output_data, std::vector lod, const bool is_reverse, platform::RNNReorderType reorder_type) { switch (reorder_type) { // Reorder input memory [WORDS, C] + LoD -> [N, T, C] case platform::RNNReorderType::PP_NTC: { - auto* input_data_iter = input_data; + auto* input_data_iter = reinterpret_cast(input_data); + auto* output_data_iter = reinterpret_cast(output_data); for (int n = 0; n < N; ++n) { const auto num_elements = (lod[n + 1] - lod[n]) * IC; const auto offset = is_reverse ? (Ti * IC - num_elements) : 0; - memcpy(output_data + n * Ti * IC + offset, input_data_iter, + memcpy(output_data_iter + n * Ti * IC + offset, input_data_iter, sizeof(T) * num_elements); input_data_iter += num_elements; } } break; // Reorder input memory [WORDS, C] + LoD -> [T, N, C] case platform::RNNReorderType::PP_TNC: { - auto* input_data_iter = input_data; + auto* input_data_iter = reinterpret_cast(input_data); + auto* output_data_iter = reinterpret_cast(output_data); for (int n = 0; n < N; ++n) { const auto num_elements = (lod[n + 1] - lod[n]); const auto offset = is_reverse ? 
(Ti - num_elements) : 0; for (size_t t = 0; t < num_elements; ++t) { - memcpy(output_data + (t + offset) * N * IC + n * IC, + memcpy(output_data_iter + (t + offset) * N * IC + n * IC, input_data_iter, sizeof(T) * IC); input_data_iter += IC; } @@ -131,24 +155,27 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { } break; // Reorder output values to PP format [N, T, C] -> [WORDS, C] case platform::RNNReorderType::NTC_PP: { - auto* output_data_iter = output_data; + auto* input_data_iter = reinterpret_cast(input_data); + auto* output_data_iter = reinterpret_cast(output_data); for (int n = 0; n < N; ++n) { const auto num_elements = (lod[n + 1] - lod[n]) * OC; const auto offset = is_reverse ? (Ti * OC - num_elements) : 0; - memcpy(output_data_iter, input_data + n * Ti * OC + offset, - sizeof(T) * num_elements); + memcpy(output_data_iter, input_data_iter + n * Ti * OC + offset, + sizeof(T_out) * num_elements); output_data_iter += num_elements; } } break; // Reorder output values to PP format [T, N, C] -> [WORDS, C] case platform::RNNReorderType::TNC_PP: { - auto* output_data_iter = output_data; + auto* input_data_iter = reinterpret_cast(input_data); + auto* output_data_iter = reinterpret_cast(output_data); for (int n = 0; n < N; ++n) { const auto num_elements = lod[n + 1] - lod[n]; const auto offset = is_reverse ? (Ti - num_elements) : 0; for (size_t t = 0; t < num_elements; ++t) { memcpy(output_data_iter, - input_data + (t + offset) * N * OC + n * OC, sizeof(T) * OC); + input_data_iter + (t + offset) * N * OC + n * OC, + sizeof(T_out) * OC); output_data_iter += OC; } } @@ -169,9 +196,9 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { } const auto& input_lod = input->lod()[0]; - auto* x_data = input->data(); + auto* x_data = to_void_cast(input->data()); - auto* x_onednn_data = reinterpret_cast(memory_p->get_data_handle()); + auto* x_onednn_data = memory_p->get_data_handle(); memset(x_onednn_data, 0, sizeof(T) * N * Ti * IC); if (platform::GetMKLDNNFormat(this->fwd_pd_->src_desc()) == @@ -198,19 +225,35 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { return memory_p; } + // TODO(grygielski) H0 is for now persistable std::shared_ptr AcquireH0Memory(const Tensor* h0) { const std::string h0_key = memory_key_ + "@h0"; auto memory_p = std::static_pointer_cast(this->dev_ctx_.GetBlob(h0_key)); - auto* h0_data = to_void_cast(h0->data()); - if (!memory_p) { - memory_p = std::make_shared( - this->fwd_pd_->weights_layer_desc(), this->engine_, h0_data); + auto user_h0_memory = dnnl::memory(); + if (h0) { + user_h0_memory = + dnnl::memory({{1, 1, N, OC}, + MKLDNNGetDataType(), + MKLDNNMemoryFormat::ldnc}, + this->engine_, to_void_cast(h0->data())); + } else { + user_h0_memory = dnnl::memory({{1, 1, N, OC}, + MKLDNNGetDataType(), + MKLDNNMemoryFormat::ldnc}, + this->engine_); + memset(user_h0_memory.get_data_handle(), 0, sizeof(float) * N * OC); + } + memory_p = std::make_shared(this->fwd_pd_->src_iter_desc(), + this->engine_); + + dnnl::stream astream(this->engine_); + dnnl::reorder(user_h0_memory, *memory_p, attr_) + .execute(astream, user_h0_memory, *memory_p); + this->dev_ctx_.SetBlob(h0_key, memory_p); - } else { - memory_p->set_data_handle(h0_data); } return memory_p; } @@ -245,7 +288,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { this->fwd_pd_->weights_layer_desc(), this->engine_); dnnl::stream astream(this->engine_); - dnnl::reorder(user_memory, *memory_p) + dnnl::reorder(user_memory, *memory_p, attr_) .execute(astream, user_memory, *memory_p); 
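
reorderRNNdata now takes void* so one handler instance can serve both f32 and u8 buffers; its PP_NTC branch scatters LoD-packed rows of shape [WORDS, C] into a zero-padded [N, Ti, C] batch, right-aligning reversed sequences. A typed sketch of that branch (illustrative, outside the handler):

```cpp
// Typed sketch of the PP_NTC reorder above: LoD-packed rows [WORDS, C] are
// scattered into a padded [N, Ti, C] buffer; reversed sequences are
// right-aligned so the padding sits at the front.
#include <cstring>
#include <vector>

template <typename T>
std::vector<T> PackToNTC(const std::vector<T>& input,
                         const std::vector<size_t>& lod,  // N + 1 offsets
                         size_t Ti, size_t C, bool is_reverse) {
  const size_t N = lod.size() - 1;
  std::vector<T> out(N * Ti * C, T(0));  // zero padding
  const T* in = input.data();
  for (size_t n = 0; n < N; ++n) {
    const size_t num_elements = (lod[n + 1] - lod[n]) * C;
    const size_t offset = is_reverse ? (Ti * C - num_elements) : 0;
    std::memcpy(out.data() + n * Ti * C + offset, in,
                sizeof(T) * num_elements);
    in += num_elements;
  }
  return out;
}

int main() {
  // Two sequences of lengths 2 and 1, C = 1, padded to Ti = 2.
  auto ntc = PackToNTC<float>({1, 2, 3}, {0, 2, 3}, 2, 1, false);
  return ntc[0] == 1 && ntc[1] == 2 && ntc[2] == 3 && ntc[3] == 0 ? 0 : 1;
}
```
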
this->dev_ctx_.SetBlob(wx_key, memory_p); @@ -298,7 +341,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { this->fwd_pd_->weights_iter_desc(), this->engine_); dnnl::stream astream(this->engine_); - dnnl::reorder(user_memory, *memory_p) + dnnl::reorder(user_memory, *memory_p, attr_) .execute(astream, user_memory, *memory_p); this->dev_ctx_.SetBlob(wh_key, memory_p); @@ -347,12 +390,26 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { // Memory size of weights, bias and h0 does not depend // on Ti size, thus we need another key to cache them std::string memory_key_; + dnnl::primitive_attr attr_; }; template class FusionGRUMKLDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const bool is_INT8 = std::is_same::value; + const bool force_fp32_output = ctx.Attr("force_fp32_output"); + + // TODO(grygielski) Add option for bfloat + if (!is_INT8 || force_fp32_output) { + RunKernel(ctx); + } else { + RunKernel(ctx); + } + } + + template + void RunKernel(const framework::ExecutionContext& ctx) const { auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); @@ -364,13 +421,16 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel { const auto* weight_h = ctx.Input("WeightH"); const auto* bias = ctx.Input("Bias"); auto* hidden = ctx.Output("Hidden"); - + auto x_dims = input->dims(); + auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1) + ? framework::flatten_to_2d(x_dims, 1) + : x_dims; // Get attributes const bool is_reverse = ctx.Attr("is_reverse"); const bool origin_mode = ctx.Attr("origin_mode"); // Get tensor dimensions - const auto x_dims = framework::vectorize(input->dims()); + const auto x_mat_dims_vec = framework::vectorize(x_mat_dims); const auto weight_h_dims = framework::vectorize(weight_h->dims()); const auto& input_lod = input->lod()[0]; @@ -384,15 +444,17 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel { } return res; }(); - const int64_t IC = x_dims[1]; // Input channels - const int64_t OC = weight_h_dims[0]; // Output channels + const int64_t IC = x_mat_dims_vec[1]; // Input channels + const int64_t OC = weight_h_dims[0]; // Output channels - GRUMKLDNNHandler handler(ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), - input, weight_h, h0, is_reverse, N, Ti, IC, OC, - ctx.InputName("X") + ctx.InputName("WeightH")); + GRUMKLDNNHandler handler( + ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, weight_h, h0, + is_reverse, N, Ti, IC, OC, + ctx.InputName("X") + ctx.InputName("WeightH")); auto input_memory_p = handler.AcquireInputMemoryWithReorder(input, is_reverse); + auto h0_memory_p = handler.AcquireH0Memory(h0); auto weight_x_memory_p = handler.AcquireWeightXMemory(weight_x, origin_mode); auto weight_h_memory_p = @@ -402,25 +464,21 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel { std::unordered_map gru_args = { {DNNL_ARG_SRC_LAYER, *input_memory_p}, + {DNNL_ARG_SRC_ITER, *h0_memory_p}, {DNNL_ARG_WEIGHTS_LAYER, *weight_x_memory_p}, {DNNL_ARG_WEIGHTS_ITER, *weight_h_memory_p}, {DNNL_ARG_BIAS, *bias_memory_p}, {DNNL_ARG_DST_LAYER, *hidden_onednn_memory_p}}; - if (h0) { - auto h0_memory_p = handler.AcquireH0Memory(h0); - gru_args.insert({DNNL_ARG_SRC_ITER, *h0_memory_p}); - } - auto gru_forward_p = handler.AcquireForwardPrimitive(); dnnl::stream astream(mkldnn_engine); gru_forward_p->execute(astream, gru_args); astream.wait(); - auto* hidden_onednn_data = - reinterpret_cast(hidden_onednn_memory_p->get_data_handle()); - auto* 
hidden_data = hidden->mutable_data(ctx.GetPlace()); + auto* hidden_onednn_data = hidden_onednn_memory_p->get_data_handle(); + auto* hidden_data = + to_void_cast(hidden->mutable_data(ctx.GetPlace())); if (handler.is_NTC()) { handler.reorderRNNdata(hidden_onednn_data, hidden_data, input_lod, is_reverse, platform::RNNReorderType::NTC_PP); @@ -436,4 +494,5 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_KERNEL(fusion_gru, MKLDNN, paddle::platform::CPUPlace, - ops::FusionGRUMKLDNNKernel); + ops::FusionGRUMKLDNNKernel, + ops::FusionGRUMKLDNNKernel); diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index 4f128463375b91803a7a4d02a27dd78157961aac..17a71c67b8a084c114497eb97568e9b536161711 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -34,8 +34,7 @@ class CPUGaussianRandomKernel : public framework::OpKernel { auto* tensor = context.Output("Out"); std::normal_distribution dist(mean, std); - const std::string op_type = "gaussian_random"; - auto shape = GetShape(context, op_type); + auto shape = GetShape(context); tensor->Resize(shape); int64_t size = tensor->numel(); T* data = tensor->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index 69c8b60040651179784cd6b77c31c66e892231be..7a0c93eb1b2eaa7afaae7f0a604a0da5ac0fd75d 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -58,8 +58,7 @@ class GPUGaussianRandomKernel : public framework::OpKernel { T mean = static_cast(context.Attr("mean")); T std = static_cast(context.Attr("std")); thrust::counting_iterator index_sequence_begin(0); - const std::string op_type = "gaussian_random"; - auto shape = GetShape(context, op_type); + auto shape = GetShape(context); tensor->Resize(shape); T* data = tensor->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/lite/lite_engine_op.h b/paddle/fluid/operators/lite/lite_engine_op.h index a920bf7c3f505b839f8f1fd252c9f8505393f3a9..f6d65704388e6ec90c9209475e5f4b19061085fa 100644 --- a/paddle/fluid/operators/lite/lite_engine_op.h +++ b/paddle/fluid/operators/lite/lite_engine_op.h @@ -39,7 +39,7 @@ class LiteEngineOp : public framework::OperatorBase { private: std::vector in_names_; std::vector out_names_; - paddle::lite::Predictor *engine_; + paddle::lite_api::PaddlePredictor *engine_; framework::proto::VarType::Type precision_; bool use_gpu_; bool zero_copy_; @@ -78,10 +78,10 @@ class LiteEngineOp : public framework::OperatorBase { framework::LoDTensor src_t = inference::analysis::GetFromScope(scope, in_names_[i]); - paddle::lite::Tensor *dst_t = engine_->GetInput(i); + paddle::lite_api::Tensor dst_t = *(engine_->GetInput(i)); VLOG(3) << "== fluid -> lite (" << in_names_[i] << " -> " << engine_->GetInputNames()[i] << ")"; - inference::lite::utils::TensorCopy(dst_t, &src_t, *ctx, zero_copy_); + inference::lite::utils::TensorCopy(&dst_t, &src_t, *ctx, zero_copy_); } #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(dev_place)) { @@ -93,7 +93,7 @@ class LiteEngineOp : public framework::OperatorBase { engine_->Run(); VLOG(3) << "lite engine run done"; for (size_t i = 0; i < out_names_.size(); i++) { - paddle::lite::Tensor src_t = *(engine_->GetOutput(i)); + paddle::lite_api::Tensor src_t = *(engine_->GetOutput(i)); framework::LoDTensor *dst_t = &inference::analysis::GetFromScope( 
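
Both gaussian_random kernels above now obtain their output dims through the shared GetShape(ctx); the CPU path then fills the buffer with std::normal_distribution. A minimal sketch of that fill step (the engine choice and seeding policy here are illustrative assumptions, not Paddle's):

```cpp
// Minimal sketch of the CPU gaussian_random fill path: resolve a shape, then
// draw from std::normal_distribution into a flat buffer.
#include <functional>
#include <numeric>
#include <random>
#include <vector>

std::vector<float> GaussianFill(const std::vector<long long>& shape,
                                float mean, float std_dev, unsigned int seed) {
  auto numel = std::accumulate(shape.begin(), shape.end(), 1LL,
                               std::multiplies<long long>());
  std::minstd_rand engine(seed);  // illustrative engine choice
  std::normal_distribution<float> dist(mean, std_dev);
  std::vector<float> data(static_cast<size_t>(numel));
  for (auto& v : data) v = dist(engine);
  return data;
}

int main() {
  auto t = GaussianFill({2, 3}, 0.0f, 1.0f, /*seed=*/42);  // 6 samples
  return t.size() == 6 ? 0 : 1;
}
```
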
scope, out_names_[i]); diff --git a/paddle/fluid/operators/lite/lite_engine_op_test.cc b/paddle/fluid/operators/lite/lite_engine_op_test.cc index fb5c0dcb3514de815b97944d0fdbf3bd7853b628..76c963ac652687cb0f65a0497b5c994f82d0d7aa 100644 --- a/paddle/fluid/operators/lite/lite_engine_op_test.cc +++ b/paddle/fluid/operators/lite/lite_engine_op_test.cc @@ -84,10 +84,10 @@ TEST(LiteEngineOp, engine_op) { inference::lite::EngineConfig config; config.valid_places = { #ifdef PADDLE_WITH_CUDA - paddle::lite::Place({TARGET(kCUDA), PRECISION(kFloat)}), + paddle::lite_api::Place({TARGET(kCUDA), PRECISION(kFloat)}), #endif - paddle::lite::Place({TARGET(kHost), PRECISION(kAny)}), - paddle::lite::Place({TARGET(kX86), PRECISION(kFloat)}), + paddle::lite_api::Place({TARGET(kX86), PRECISION(kFloat)}), + paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}), }; serialize_params(&(config.param), &scope, repetitive_params); config.model = program.Proto()->SerializeAsString(); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 3a19c7edff3569d503480fd060a6432dc59d2108..10d335b828b516fe08871f314ba4667c06f04714 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -9,7 +9,11 @@ function(math_library TARGET) set(hip_srcs) set(math_common_deps device_context framework_proto enforce) if (WITH_GPU) - list(APPEND math_common_deps cub) + if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + list(APPEND math_common_deps cub) + else() + list(APPEND math_common_deps) + endif() endif() set(multiValueArgs DEPS) cmake_parse_arguments(math_library "${options}" "${oneValueArgs}" diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 824e66b1eb4ae05cc74dc1cd8c21f16f286592e6..f44b33fcf2fc23f79483909046dd9e292fd8dde8 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -128,9 +128,23 @@ struct RowwiseAdd { const framework::Tensor& input, const framework::Tensor& vector, framework::Tensor* output) { auto in_dims = input.dims(); + auto out_dims = output->dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(vector.numel(), size); - PADDLE_ENFORCE_EQ(output->dims(), in_dims); + PADDLE_ENFORCE_EQ( + vector.numel(), size, + platform::errors::InvalidArgument( + "The input vector size" + " should be equal to the size of each row of input tensor." + " Expected vector size=%d, but received %d", + size, vector.numel())); + const char* in_dims_cstr = in_dims.to_str().c_str(); + const char* out_dims_cstr = out_dims.to_str().c_str(); + PADDLE_ENFORCE_EQ(out_dims, in_dims, + platform::errors::InvalidArgument( + "The output tensor shape should be same as the input" + " tensor shape. 
Expected output tensor shape: %s," + " but received %s", + in_dims_cstr, out_dims_cstr)); auto in = framework::EigenMatrix::From(input); auto vec = framework::EigenVector::Flatten(vector); diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index fba143d017deb4b4814ad8b10e614357a7ebee23..1c519d226ebfe5ff19876f17b79fd36aa12c4130 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -88,9 +88,24 @@ struct RowwiseAdd { const framework::Tensor& input, const framework::Tensor& vector, framework::Tensor* output) { auto in_dims = input.dims(); + auto out_dims = output->dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(vector.numel(), size); - PADDLE_ENFORCE_EQ(output->dims(), in_dims); + PADDLE_ENFORCE_EQ( + vector.numel(), size, + platform::errors::InvalidArgument( + "The input vector size" + " should be equal to the size of each row of input tensor." + " Expected vector size=%d, but received %d", + size, vector.numel())); + const char* in_dims_cstr = in_dims.to_str().c_str(); + const char* out_dims_cstr = out_dims.to_str().c_str(); + PADDLE_ENFORCE_EQ( + out_dims, in_dims, + platform::errors::InvalidArgument( + "The output tensor shape should be same as the input tensor" + " shape. Expected output tensor shape: %s," + " but received %s", + in_dims_cstr, out_dims_cstr)); int blocks = 512; int grids = (input.numel() + blocks - 1) / blocks; RowwiseAddKernel<<>>( @@ -113,7 +128,12 @@ void ColwiseSum::operator()( framework::Tensor* vector) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(vector->numel(), size); + PADDLE_ENFORCE_EQ(vector->numel(), size, + platform::errors::InvalidArgument( + "The size of input vector" + " should be equal to the size of input tensor column" + " dimension. Expected vector size=%d, but received %d", + size, vector->numel())); framework::Tensor one; one.mutable_data({in_dims[0]}, context.GetPlace()); SetConstant set; @@ -134,7 +154,12 @@ void RowwiseSum::operator()( framework::Tensor* vector) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0]); + PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0], + platform::errors::InvalidArgument( + "The size of input vector" + " should be equal to the size of input tensor row" + " dimension. Expected vector size=%d, but received %d", + in_dims[0], vector->numel())); framework::Tensor one; one.mutable_data({size}, context.GetPlace()); SetConstant set; diff --git a/paddle/fluid/operators/math/math_function_impl.h b/paddle/fluid/operators/math/math_function_impl.h index 693d5620460e1fe6f6d82bd0749b0780b64841f5..869a3054598da9cd2223ca0e705c0f910ba043ec 100644 --- a/paddle/fluid/operators/math/math_function_impl.h +++ b/paddle/fluid/operators/math/math_function_impl.h @@ -59,7 +59,12 @@ void ColwiseSum::operator()(const DeviceContext& context, framework::Tensor* out) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(out->numel(), size); + PADDLE_ENFORCE_EQ(out->numel(), size, + platform::errors::InvalidArgument( + "The size of output tensor " + "should be equal to the size of input tensor column" + " dimension. 
Expected output size=%d, but received %d", + size, out->numel())); auto in = framework::EigenMatrix::From(input); auto vec = framework::EigenVector::Flatten(*out); @@ -78,7 +83,13 @@ class ColwiseSum { auto& in_dims = input.dims(); auto height = in_dims[0]; auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), size); + PADDLE_ENFORCE_EQ( + out->numel(), size, + platform::errors::InvalidArgument( + "The size of output tensor " + "should be equal to the size of input tensor column" + " dimension. Expected output size=%d, but received %d", + size, out->numel())); T* out_buf = out->mutable_data(out->place()); const T* in_buf = input.data(); @@ -100,8 +111,16 @@ void RowwiseMean::operator()(const DeviceContext& context, const framework::Tensor& input, framework::Tensor* out) { auto in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); - PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument( + "The rank of input tensor " + "should be 2, but received %d", + in_dims.size())); + PADDLE_ENFORCE_EQ(out->numel(), in_dims[0], + platform::errors::InvalidArgument( + "The size of output tensor " + "should be equal to the size of input tensor row" + " dimension. Expected output size=%d, but received %d", + in_dims[0], out->numel())); auto in = framework::EigenMatrix::From(input); auto vec = framework::EigenVector::Flatten(*out); @@ -118,10 +137,19 @@ class RowwiseMean { void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, framework::Tensor* out) { auto& in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument( + "The rank of input tensor " + "should be 2, but received %d", + in_dims.size())); auto height = in_dims[0]; auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), height); + PADDLE_ENFORCE_EQ( + out->numel(), height, + platform::errors::InvalidArgument( + "The size of output tensor " + "should be equal to the size of input tensor row" + " dimension. Expected output size=%d, but received %d", + height, out->numel())); auto inv_size = 1.0 / size; T* out_buf = out->mutable_data(out->place()); const T* in_buf = input.data(); @@ -141,8 +169,16 @@ void RowwiseSum::operator()(const DeviceContext& context, const framework::Tensor& input, framework::Tensor* out) { auto in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); - PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument( + "The rank of input tensor " + "should be 2, but received %d", + in_dims.size())); + PADDLE_ENFORCE_EQ(out->numel(), in_dims[0], + platform::errors::InvalidArgument( + "The size of output tensor " + "should be equal to the size of input tensor row" + " dimension. 
Expected output size=%d, but received %d", + in_dims[0], out->numel())); auto in = framework::EigenMatrix::From(input); auto vec = framework::EigenVector::Flatten(*out); @@ -159,10 +195,19 @@ class RowwiseSum { void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, framework::Tensor* out) { auto& in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument( + "The rank of input tensor " + "should be 2, but received %d", + in_dims.size())); auto height = in_dims[0]; auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), height); + PADDLE_ENFORCE_EQ( + out->numel(), height, + platform::errors::InvalidArgument( + "The size of output tensor " + "should be equal to the size of input tensor row" + " dimension. Expected output size=%d, but received %d", + height, out->numel())); T* out_buf = out->mutable_data(out->place()); const T* in_buf = input.data(); diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc index 2343e0ee965303c9fdb2ad3faf9ddf6e5bb7782f..587823e535ac67f926fd469d2f43df536c8c88b6 100644 --- a/paddle/fluid/operators/math/math_function_test.cc +++ b/paddle/fluid/operators/math/math_function_test.cc @@ -224,7 +224,11 @@ TEST(math_funciton, set_constant) { auto* ctx = new paddle::platform::CPUDeviceContext(); paddle::operators::math::set_constant(*ctx, &t, 10); for (int64_t i = 0; i < t.numel(); ++i) { - PADDLE_ENFORCE_EQ(10, t.data()[i]); + PADDLE_ENFORCE_EQ(10, t.data()[i], + paddle::platform::errors::InvalidArgument( + "Each value of input" + "tensor should be 10, but received %d.", + t.data()[i])); } delete ctx; } diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu index bcbb4a8274f149240b9f0990f38d9f38bdd0e5b1..44b1ee45a4fe9b6f2ea7ba5e09c7cbc60c1aff28 100644 --- a/paddle/fluid/operators/math/math_function_test.cu +++ b/paddle/fluid/operators/math/math_function_test.cu @@ -18,7 +18,12 @@ void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size, const std::vector& data) { - PADDLE_ENFORCE_EQ(size, data.size()); + PADDLE_ENFORCE_EQ( + size, data.size(), + paddle::platform::errors::InvalidArgument( + "The size of argument data should" + " be equal to the argument size. 
Expected %d, but received %d.", + size, data.size())); for (size_t i = 0; i < data.size(); ++i) { in_ptr[i] = paddle::platform::float16(data[i]); } diff --git a/paddle/fluid/operators/math/padding.h b/paddle/fluid/operators/math/padding.h index 63f793433de07ea2e43ad03ea3ccae1a259f7ae2..379b21c3c18888989663221052e6e99df80e7e9d 100644 --- a/paddle/fluid/operators/math/padding.h +++ b/paddle/fluid/operators/math/padding.h @@ -85,8 +85,9 @@ void PaddingFunctor(int rank, const framework::ExecutionContext& context, PadFunction(context, pads, src, pad_value, out); break; default: - PADDLE_THROW( - "PadOp only support tensors with no more than 6 dimensions."); + PADDLE_THROW(platform::errors::Unimplemented( + "PadOp only support tensors with no more" + " than 6 dimensions currently.")); } } @@ -114,8 +115,9 @@ void PaddingGradFunctor(int rank, const framework::ExecutionContext& context, PadGradFunction(context, pads, src, out); break; default: - PADDLE_THROW( - "PadOp only support tensors with no more than 6 dimensions."); + PADDLE_THROW(platform::errors::Unimplemented( + "PadOp only support tensors with no more" + " than 6 dimensions currently.")); } } diff --git a/paddle/fluid/operators/math/sampler.h b/paddle/fluid/operators/math/sampler.h index 480576ef9dc8c21811a1a867d553ccc6d97fa22a..de9113f2bb616b489747d8d960154f55bb988847 100644 --- a/paddle/fluid/operators/math/sampler.h +++ b/paddle/fluid/operators/math/sampler.h @@ -19,6 +19,8 @@ limitations under the License. */ #include #include +#include "paddle/fluid/platform/enforce.h" + namespace paddle { namespace operators { namespace math { @@ -31,7 +33,10 @@ namespace math { class Sampler { public: explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) { - // PADDLE_ENFORCE_GT(range, 0, "Range should be greater than 0."); + PADDLE_ENFORCE_GT(range, 0, platform::errors::InvalidArgument( + "Range should be" + " greater than 0, but recevied %d.", + range)); if (seed == 0) { std::random_device r; seed_ = r(); diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 3bb9efc5315fcacf0b50682b65c89ac3ad0d2d4e..c2595beb0cb4dc37104a91ac8a2647c7d787c5c5 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -29,7 +29,12 @@ struct SelectedRowsAdd { const framework::SelectedRows& input2, framework::SelectedRows* output) { auto in1_height = input1.height(); - PADDLE_ENFORCE_EQ(in1_height, input2.height()); + PADDLE_ENFORCE_EQ( + in1_height, input2.height(), + platform::errors::InvalidArgument("The two inputs height must be equal." + "But recieved first input height = " + "[%d], second input height = [%d]", + in1_height, input2.height())); output->set_height(in1_height); auto& in1_rows = input1.rows(); @@ -47,15 +52,31 @@ struct SelectedRowsAdd { auto& in2_value = input2.value(); auto in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); - PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size()); + PADDLE_ENFORCE_EQ( + in1_row_numel, in2_value.numel() / in2_rows.size(), + platform::errors::InvalidArgument( + "The two inputs width must be equal." 
+ "But recieved first input width = [%d], second input width = [%d]", + in1_row_numel, in2_value.numel() / in2_rows.size())); + PADDLE_ENFORCE_EQ( + in1_row_numel, out_value->numel() / out_rows.size(), + platform::errors::InvalidArgument( + "The input and oupput width must be equal." + "But recieved input width = [%d], output width = [%d]", + in1_row_numel, out_value->numel() / out_rows.size())); auto in1_place = input1.place(); - PADDLE_ENFORCE(platform::is_cpu_place(in1_place)); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(in1_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the CPU place.")); auto in2_place = input2.place(); - PADDLE_ENFORCE(platform::is_cpu_place(in2_place)); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(in2_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the CPU place.")); auto out_place = context.GetPlace(); - PADDLE_ENFORCE(platform::is_cpu_place(out_place)); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(out_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the CPU place.")); auto* out_data = out_value->data(); auto* in1_data = in1_value.data(); @@ -82,15 +103,35 @@ struct SelectedRowsAddTensor { auto in1_height = input1.height(); auto in2_dims = input2.dims(); auto out_dims = output->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); - PADDLE_ENFORCE_EQ(in1_height, out_dims[0]); + PADDLE_ENFORCE_EQ( + in1_height, in2_dims[0], + platform::errors::InvalidArgument("The two inputs height must be equal." + "But recieved first input height = " + "[%d], second input height = [%d]", + in1_height, in2_dims[0])); + PADDLE_ENFORCE_EQ( + in1_height, out_dims[0], + platform::errors::InvalidArgument( + "The input and output height must be equal." + "But recieved input height = [%d], output height = [%d]", + in1_height, out_dims[0])); auto& in1_value = input1.value(); auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); - PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height); + PADDLE_ENFORCE_EQ( + in1_row_numel, input2.numel() / in1_height, + platform::errors::InvalidArgument( + "The two inputs width must be equal." + "But recieved first input width = [%d], second input width = [%d]", + in1_row_numel, input2.numel() / in1_height)); + PADDLE_ENFORCE_EQ( + in1_row_numel, output->numel() / in1_height, + platform::errors::InvalidArgument( + "The input and output width must be equal." + "But recieved input width = [%d], output width = [%d]", + in1_row_numel, output->numel() / in1_height)); SetConstant functor; functor(context, output, 0.0); @@ -121,7 +162,12 @@ struct SelectedRowsAddTo { const int64_t input2_offset, framework::SelectedRows* input2) { auto in1_height = input1.height(); - PADDLE_ENFORCE_EQ(in1_height, input2->height()); + PADDLE_ENFORCE_EQ( + in1_height, input2->height(), + platform::errors::InvalidArgument("The two inputs height must be equal." 
+ "But recieved first input height = " + "[%d], second input height = [%d]", + in1_height, input2->height())); auto& in1_rows = input1.rows(); auto& in2_rows = *(input2->mutable_rows()); @@ -133,9 +179,13 @@ struct SelectedRowsAddTo { in2_rows.Extend(in1_rows.begin(), in1_rows.end()); auto in1_place = input1.place(); - PADDLE_ENFORCE(platform::is_cpu_place(in1_place)); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(in1_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the CPU place.")); auto in2_place = input2->place(); - PADDLE_ENFORCE(platform::is_cpu_place(in2_place)); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(in2_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the CPU place.")); auto* in1_data = in1_value.data(); auto* in2_data = in2_value->data(); @@ -163,7 +213,12 @@ struct SelectedRowsSumTo { auto& in_rows = (*iter)->rows(); size += in_rows.end() - in_rows.begin(); auto in1_height = (*iter)->height(); - PADDLE_ENFORCE_EQ(in1_height, input2->height()); + PADDLE_ENFORCE_EQ(in1_height, input2->height(), + platform::errors::InvalidArgument( + "The two inputs height must be equal." + "But recieved first input height = [%d], second " + "input height = [%d]", + in1_height, input2->height())); } // concat rows std::vector in2_rows; @@ -201,13 +256,23 @@ struct SelectedRowsAddToTensor { } auto in1_height = input1.height(); auto in2_dims = input2->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + PADDLE_ENFORCE_EQ( + in1_height, in2_dims[0], + platform::errors::InvalidArgument("The two inputs height must be equal." + "But recieved first input height = " + "[%d], second input height = [%d]", + in1_height, in2_dims[0])); auto& in1_value = input1.value(); auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + PADDLE_ENFORCE_EQ( + in1_row_numel, input2->numel() / in1_height, + platform::errors::InvalidArgument( + "The two inputs width must be equal." 
+ "But recieved first input width = [%d], second input width = [%d]", + in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.data(); auto* input2_data = input2->data(); @@ -302,10 +367,12 @@ struct MergeAdd { continue; } PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1], - "all input should have same " - "dimension except for the first one"); + platform::errors::InvalidArgument( + "All inputs should have same " + "dimension except for the first one.")); PADDLE_ENFORCE_EQ(input_height, input->height(), - "all input should have same height"); + platform::errors::InvalidArgument( + "All inputs should have same height.")); row_num += input->rows().size(); merged_row_set.insert(input->rows().begin(), input->rows().end()); } @@ -421,10 +488,12 @@ struct MergeAverage { continue; } PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1], - "all input should have same " - "dimension except for the first one"); + platform::errors::InvalidArgument( + "All inputs should have same " + "dimension except for the first one.")); PADDLE_ENFORCE_EQ(input_height, input->height(), - "all input should have same height"); + platform::errors::InvalidArgument( + "All input should have same height.")); row_num += input->rows().size(); merged_row_set.insert(input->rows().begin(), input->rows().end()); } @@ -492,13 +561,23 @@ struct UpdateToTensor { framework::Tensor* input2) { auto in1_height = input1.height(); auto in2_dims = input2->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + PADDLE_ENFORCE_EQ( + in1_height, in2_dims[0], + platform::errors::InvalidArgument("The two inputs height must be equal." + "But recieved first input height = " + "[%d], second input height = [%d]", + in1_height, in2_dims[0])); auto& in1_value = input1.value(); auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + PADDLE_ENFORCE_EQ( + in1_row_numel, input2->numel() / in1_height, + platform::errors::InvalidArgument( + "The two inputs width must be equal." + "But recieved first input width = [%d], second input width = [%d]", + in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.data(); auto* input2_data = input2->data(); diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index 9cce52c6d4587baf01ba22eebc9c57da04c26590..35bd02ad35b71eb7deb3299490fa545ef8b23dc6 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -30,7 +30,12 @@ struct SelectedRowsAdd { const framework::SelectedRows& input2, framework::SelectedRows* output) { auto in1_height = input1.height(); - PADDLE_ENFORCE_EQ(in1_height, input2.height()); + PADDLE_ENFORCE_EQ( + in1_height, input2.height(), + platform::errors::InvalidArgument("The two inputs height must be equal." 
+ "But recieved first input height = " + "[%d], second input height = [%d]", + in1_height, input2.height())); output->set_height(in1_height); framework::Vector in1_rows(input1.rows()); @@ -48,18 +53,34 @@ struct SelectedRowsAdd { auto& in2_value = input2.value(); auto in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); - PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size()); + PADDLE_ENFORCE_EQ( + in1_row_numel, in2_value.numel() / in2_rows.size(), + platform::errors::InvalidArgument( + "The two inputs width must be equal." + "But recieved first input width = [%d], second input width = [%d]", + in1_row_numel, in2_value.numel() / in2_rows.size())); + PADDLE_ENFORCE_EQ( + in1_row_numel, out_value->numel() / out_rows.size(), + platform::errors::InvalidArgument( + "The input and oupput width must be equal." + "But recieved input width = [%d], output width = [%d]", + in1_row_numel, out_value->numel() / out_rows.size())); auto* out_data = out_value->data(); auto* in1_data = in1_value.data(); auto in1_place = input1.place(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the GPU place.")); auto in2_place = input2.place(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the GPU place.")); auto out_place = context.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_place), true); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the GPU place.")); memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, out_place), out_data, BOOST_GET_CONST(platform::CUDAPlace, in1_place), in1_data, @@ -104,15 +125,35 @@ struct SelectedRowsAddTensor { auto in1_height = input1.height(); auto in2_dims = input2.dims(); auto out_dims = output->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); - PADDLE_ENFORCE_EQ(in1_height, out_dims[0]); + PADDLE_ENFORCE_EQ( + in1_height, in2_dims[0], + platform::errors::InvalidArgument( + "The two inputs height must be equal." + "But recieved first input height = [%d], first input height = [%d]", + in1_height, in2_dims[0])); + PADDLE_ENFORCE_EQ( + in1_height, out_dims[0], + platform::errors::InvalidArgument( + "The input and output height must be equal." + "But recieved input height = [%d], output height = [%d]", + in1_height, out_dims[0])); auto& in1_value = input1.value(); auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); - PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height); + PADDLE_ENFORCE_EQ( + in1_row_numel, input2.numel() / in1_height, + platform::errors::InvalidArgument( + "The two inputs width must be equal." + "But recieved first input width = [%d], second input width = [%d]", + in1_row_numel, input2.numel() / in1_height)); + PADDLE_ENFORCE_EQ( + in1_row_numel, output->numel() / in1_height, + platform::errors::InvalidArgument( + "The input and output width must be equal." 
+ "But recieved input width = [%d], output width = [%d]", + in1_row_numel, output->numel() / in1_height)); auto* in1_data = in1_value.data(); auto* in2_data = input2.data(); @@ -148,7 +189,12 @@ struct SelectedRowsAddTo { const int64_t input2_offset, framework::SelectedRows* input2) { auto in1_height = input1.height(); - PADDLE_ENFORCE_EQ(in1_height, input2->height()); + PADDLE_ENFORCE_EQ( + in1_height, input2->height(), + platform::errors::InvalidArgument("The two inputs height must be equal." + "But recieved first input height = " + "[%d], second input height = [%d]", + in1_height, input2->height())); auto& in1_rows = input1.rows(); auto& in2_rows = *(input2->mutable_rows()); @@ -162,9 +208,13 @@ struct SelectedRowsAddTo { } auto in1_place = input1.place(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the GPU place.")); auto in2_place = input2->place(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the GPU place.")); auto* in1_data = in1_value.data(); auto* in2_data = in2_value->data(); @@ -209,13 +259,23 @@ struct SelectedRowsAddToTensor { framework::Tensor* input2) { auto in1_height = input1.height(); auto in2_dims = input2->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + PADDLE_ENFORCE_EQ( + in1_height, in2_dims[0], + platform::errors::InvalidArgument("The two inputs height must be equal." + "But recieved first input height = " + "[%d], second input height = [%d]", + in1_height, in2_dims[0])); auto& in1_value = input1.value(); auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + PADDLE_ENFORCE_EQ( + in1_row_numel, input2->numel() / in1_height, + platform::errors::InvalidArgument( + "The two inputs width must be equal." + "But recieved first input width = [%d], second input width = [%d]", + in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.data(); auto* in2_data = input2->data(); @@ -340,10 +400,12 @@ struct MergeAdd { continue; } PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1], - "all input should have same " - "dimension except for the first one"); + platform::errors::InvalidArgument( + "All input should have same " + "dimension except for the first one.")); PADDLE_ENFORCE_EQ(input_height, input->height(), - "all input should have same height"); + platform::errors::InvalidArgument( + "All input should have same height.")); merged_row_set.insert(input->rows().begin(), input->rows().end()); } std::vector merge_rows_cpu(merged_row_set.begin(), @@ -448,13 +510,23 @@ struct UpdateToTensor { auto in1_height = merged_in1.height(); auto in2_dims = input2->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + PADDLE_ENFORCE_EQ( + in1_height, in2_dims[0], + platform::errors::InvalidArgument("The two inputs height must be equal." 
+ "But recieved first input height = " + "[%d], second input height = [%d]", + in1_height, in2_dims[0])); auto& in1_value = merged_in1.value(); auto& in1_rows = merged_in1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + PADDLE_ENFORCE_EQ( + in1_row_numel, input2->numel() / in1_height, + platform::errors::InvalidArgument( + "The two inputs width must be equal." + "But recieved first input width = [%d], second input width = [%d]", + in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.template data(); auto* in2_data = input2->data(); diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc index 74892316e6decdeab3a08396fa2f4bdeb8eb7b73..81ad620466ee3d9fcd9d3e057cfd0dd9053089f0 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc @@ -38,7 +38,9 @@ TEST(selected_rows_functor, gpu_add) { {static_cast(rows1.size()), row_numel}), gpu_place); functor(ctx, in1_value, 1.0); - PADDLE_ENFORCE(cudaDeviceSynchronize()); + PADDLE_ENFORCE_EQ(cudaDeviceSynchronize(), 0, + paddle::platform::errors::PreconditionNotMet( + "The all synchronization on the cuda is error!")); std::vector rows2{0, 5, 7, 9}; std::unique_ptr selected_rows2{ diff --git a/paddle/fluid/operators/math/vol2col.cc b/paddle/fluid/operators/math/vol2col.cc index 01f50727b442579fa62059560d0c75d329d6e288..c05da0062f2bab66746feb9d8ebedeca0c0f9688 100644 --- a/paddle/fluid/operators/math/vol2col.cc +++ b/paddle/fluid/operators/math/vol2col.cc @@ -34,10 +34,16 @@ class Vol2ColFunctor { const std::vector& strides, const std::vector& paddings, framework::Tensor* col, const DataLayout data_layout) const { - PADDLE_ENFORCE_EQ(vol.dims().size(), 4, - "The dimension of vol should be 4."); - PADDLE_ENFORCE_EQ(col->dims().size(), 7, - "The dimension of col should be 7."); + PADDLE_ENFORCE_EQ( + vol.dims().size(), 4, + platform::errors::InvalidArgument("The dimension of" + " vol should be 4, but received %d.", + vol.dims().size())); + PADDLE_ENFORCE_EQ( + col->dims().size(), 7, + platform::errors::InvalidArgument("The dimension of" + "col should be 7, but received %d.", + col->dims().size())); int input_channels = (data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]); @@ -65,27 +71,33 @@ class Vol2ColFunctor { int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; int pad_w_right = paddings_size_is_6 ? 
paddings[5] : paddings[2]; - PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back - - ((dilations[0] * (filter_depth - 1) + 1))) / - strides[0] + - 1, - output_depth, - "input_depth and output_depth are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down - - ((dilations[1] * (filter_height - 1) + 1))) / - strides[1] + - 1, - output_height, - "input_height and output_height are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right - - ((dilations[2] * (filter_width - 1) + 1))) / - strides[2] + - 1, - output_width, - "input_width and output_width are " - "mismatching."); + auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1; + PADDLE_ENFORCE_EQ( + input_depth_tmp, output_depth, + platform::errors::InvalidArgument( + "input_depth(%d) and output_depth(%d) are mismatching.", + input_depth_tmp, output_depth)); + auto input_height_tmp = (input_height + pad_h_up + pad_h_down - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1; + PADDLE_ENFORCE_EQ( + input_height_tmp, output_height, + platform::errors::InvalidArgument( + "input_height(%d) and output_height(%d) are mismatching.", + input_height_tmp, output_height)); + auto input_width_tmp = (input_width + pad_w_left + pad_w_right - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1; + PADDLE_ENFORCE_EQ( + input_width_tmp, output_width, + platform::errors::InvalidArgument( + "input_width(%d) and output_width(%d) are mismatching.", + input_width_tmp, output_width)); const T* vol_data = vol.data(); T* col_data = col->data(); @@ -140,10 +152,16 @@ class Col2VolFunctor { const std::vector& strides, const std::vector& paddings, framework::Tensor* vol, const DataLayout data_layout) const { - PADDLE_ENFORCE_EQ(vol->dims().size(), 4, - "The dimension of vol should be 4."); - PADDLE_ENFORCE_EQ(col.dims().size(), 7, - "The dimension of col should be 7."); + PADDLE_ENFORCE_EQ( + vol->dims().size(), 4, + platform::errors::InvalidArgument("The dimension of vol" + " should be 4, but received %d.", + vol->dims().size())); + PADDLE_ENFORCE_EQ( + col.dims().size(), 7, + platform::errors::InvalidArgument("The dimension of col" + " should be 7, but received %d.", + col.dims().size())); int input_channels = (data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]); @@ -170,27 +188,33 @@ class Col2VolFunctor { int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; int pad_w_right = paddings_size_is_6 ? 
paddings[5] : paddings[2]; - PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back - - ((dilations[0] * (filter_depth - 1) + 1))) / - strides[0] + - 1, - output_depth, - "input_depth and output_depth are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down - - ((dilations[1] * (filter_height - 1) + 1))) / - strides[1] + - 1, - output_height, - "input_height and output_height are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right - - ((dilations[2] * (filter_width - 1) + 1))) / - strides[2] + - 1, - output_width, - "input_width and output_width are " - "mismatching."); + auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1; + PADDLE_ENFORCE_EQ(input_depth_tmp, output_depth, + platform::errors::InvalidArgument( + "input_depth(%d)" + " and output_depth(%d) are mismatching.", + input_depth_tmp, output_depth)); + auto input_height_tmp = (input_height + pad_h_up + pad_h_down - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1; + PADDLE_ENFORCE_EQ(input_height_tmp, output_height, + platform::errors::InvalidArgument( + "input_height(%d)" + " and output_height(%d) are mismatching.", + input_height_tmp, output_height)); + auto input_width_tmp = (input_width + pad_w_left + pad_w_right - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1; + PADDLE_ENFORCE_EQ(input_width_tmp, output_width, + platform::errors::InvalidArgument( + "input_width(%d)" + " and output_width(%d) are mismatching.", + input_width_tmp, output_width)); T* vol_data = vol->data(); const T* col_data = col.data(); diff --git a/paddle/fluid/operators/math/vol2col.cu b/paddle/fluid/operators/math/vol2col.cu index 9de9051f512348f2567bfc35ae775b1852ed25fc..fe5a600909893b8313d470923ef4d43eae155e76 100644 --- a/paddle/fluid/operators/math/vol2col.cu +++ b/paddle/fluid/operators/math/vol2col.cu @@ -90,10 +90,16 @@ class Vol2ColFunctor { const std::vector& strides, const std::vector& paddings, framework::Tensor* col, const DataLayout data_layout) const { - PADDLE_ENFORCE_EQ(vol.dims().size(), 4, - "The dimension of vol should be 4."); - PADDLE_ENFORCE_EQ(col->dims().size(), 7, - "The dimension of col should be 7."); + PADDLE_ENFORCE_EQ( + vol.dims().size(), 4, + platform::errors::InvalidArgument("The dimension of" + " vol should be 4, but received %d.", + vol.dims().size())); + PADDLE_ENFORCE_EQ( + col->dims().size(), 7, + platform::errors::InvalidArgument("The dimension of" + " col should be 7, but received %d.", + col->dims().size())); int input_channels = (data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]); @@ -117,27 +123,33 @@ class Vol2ColFunctor { int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1]; int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; int pad_w_right = paddings_size_is_6 ? 
paddings[5] : paddings[2]; - PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back - - ((dilations[0] * (filter_depth - 1) + 1))) / - strides[0] + - 1, - output_depth, - "input_depth and output_depth are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down - - ((dilations[1] * (filter_height - 1) + 1))) / - strides[1] + - 1, - output_height, - "input_height and output_height are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right - - ((dilations[2] * (filter_width - 1) + 1))) / - strides[2] + - 1, - output_width, - "input_width and output_width are " - "mismatching."); + auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1; + PADDLE_ENFORCE_EQ( + input_depth_tmp, output_depth, + platform::errors::InvalidArgument( + "input_depth(%d) and output_depth(%d) are mismatching.", + input_depth_tmp, output_depth)); + auto input_height_tmp = (input_height + pad_h_up + pad_h_down - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1; + PADDLE_ENFORCE_EQ( + input_height_tmp, output_height, + platform::errors::InvalidArgument( + "input_height(%d) and output_height(%d) are mismatching.", + input_height_tmp, output_height)); + auto input_width_tmp = (input_width + pad_w_left + pad_w_right - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1; + PADDLE_ENFORCE_EQ( + input_width_tmp, output_width, + platform::errors::InvalidArgument( + "input_width(%d) and output_width(%d) are mismatching.", + input_width_tmp, output_width)); int num_outputs = input_channels * output_depth * output_height * output_width; @@ -241,10 +253,16 @@ class Col2VolFunctor { const std::vector& strides, const std::vector& paddings, framework::Tensor* vol, const DataLayout data_layout) const { - PADDLE_ENFORCE_EQ(vol->dims().size(), 4, - "The dimension of vol should be 4."); - PADDLE_ENFORCE_EQ(col.dims().size(), 7, - "The dimension of col should be 7."); + PADDLE_ENFORCE_EQ( + vol->dims().size(), 4, + platform::errors::InvalidArgument("The dimension of vol" + " should be 4, but received %d.", + vol->dims().size())); + PADDLE_ENFORCE_EQ( + col.dims().size(), 7, + platform::errors::InvalidArgument("The dimension of col" + " should be 7, but received %d.", + col.dims().size())); int input_channels = (data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]); @@ -269,27 +287,33 @@ class Col2VolFunctor { int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; int pad_w_right = paddings_size_is_6 ? 
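Note: the vol2col/col2vol hunks replace a repeated inline expression with named temporaries (input_depth_tmp and friends), every instance of the standard convolution output-size formula: out = (in + pad_before + pad_after - (dilation * (kernel - 1) + 1)) / stride + 1. A small standalone check of that arithmetic:

    #include <cassert>

    int ConvOutSize(int in, int pad_before, int pad_after, int dilation,
                    int kernel, int stride) {
      return (in + pad_before + pad_after - (dilation * (kernel - 1) + 1)) /
                 stride +
             1;
    }

    int main() {
      // Input extent 7, symmetric padding 1, 3-wide kernel, dilation 1,
      // stride 2: (7 + 2 - 3) / 2 + 1 = 4.
      assert(ConvOutSize(7, 1, 1, 1, 3, 2) == 4);
      return 0;
    }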
paddings[5] : paddings[2]; - PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back - - ((dilations[0] * (filter_depth - 1) + 1))) / - strides[0] + - 1, - output_depth, - "input_depth and output_depth are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down - - ((dilations[1] * (filter_height - 1) + 1))) / - strides[1] + - 1, - output_height, - "input_height and output_height are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right - - ((dilations[2] * (filter_width - 1) + 1))) / - strides[2] + - 1, - output_width, - "input_width and output_width are " - "mismatching."); + auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1; + PADDLE_ENFORCE_EQ(input_depth_tmp, output_depth, + platform::errors::InvalidArgument( + "input_depth(%d)" + " and output_depth(%d) are mismatching.", + input_depth_tmp, output_depth)); + auto input_height_tmp = (input_height + pad_h_up + pad_h_down - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1; + PADDLE_ENFORCE_EQ(input_height_tmp, output_height, + platform::errors::InvalidArgument( + "input_height(%d)" + " and output_height(%d) are mismatching.", + input_height_tmp, output_height)); + auto input_width_tmp = (input_width + pad_w_left + pad_w_right - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1; + PADDLE_ENFORCE_EQ(input_width_tmp, output_width, + platform::errors::InvalidArgument( + "input_width(%d)" + " and output_width(%d) are mismatching.", + input_width_tmp, output_width)); int num_kernels = input_channels * input_depth * input_height * input_width; diff --git a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc index 98200caca8cf66960632b88966f23e99fcd4c299..51fa5ad021a2b284cd75f297d83326b2102c1e41 100644 --- a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc @@ -30,8 +30,7 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel { float std = context.Attr("std"); auto* tensor = context.Output("Out"); - const std::string op_type = "gaussian_random"; - auto shape = GetShape(context, op_type); + auto shape = GetShape(context); tensor->Resize(shape); T* data = tensor->mutable_data(context.GetPlace()); int64_t size = tensor->numel(); diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cc b/paddle/fluid/operators/optimizers/lars_momentum_op.cc old mode 100644 new mode 100755 index 5f0500d2faa77f7c2e901c0d30ab2c42036d2a86..479f9643749d63c673158ad055409a0925f3d576 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cc @@ -48,6 +48,9 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("lars_weight_decay", "(float, default 0.0005) LARS weight decay") .SetDefault(0.0005); + AddAttr("epsilon", + "(float, default 0.0) epsilon to avoid Division by Zero.") + .SetDefault(0.0); AddComment(R"DOC( Lars Momentum Optimizer. 
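Note: the new epsilon attribute guards the division inside the LARS local learning rate, and the weight_decay > 0 condition added in the kernels below skips the LARS rescaling entirely when weight decay is disabled. A scalar sketch of the update rule (simplified to one parameter; the real kernels work elementwise on tensors and use whole-tensor norms):

    local_lr = lr * lars_coeff * ||p|| / (||g|| + weight_decay * ||p|| + epsilon)
    v_new    = mu * v + local_lr * (g + weight_decay * p)
    p_new    = p - v_new

    #include <cmath>
    #include <cstdio>

    int main() {
      float p = 1.0f, g = 0.5f, v = 0.0f;
      float lr = 0.1f, mu = 0.9f, lars_coeff = 0.001f;
      float weight_decay = 0.0005f, epsilon = 1e-8f;
      float p_norm = std::fabs(p);  // ||p|| degenerates to |p| for one scalar
      float g_norm = std::fabs(g);
      float local_lr = lr;
      // Mirrors the patched guard: only rescale when weight decay is active
      // and both norms are positive.
      if (weight_decay > 0 && p_norm > 0 && g_norm > 0) {
        local_lr = lr * lars_coeff * p_norm /
                   (g_norm + weight_decay * p_norm + epsilon);
      }
      float v_new = v * mu + local_lr * (g + weight_decay * p);
      float p_new = p - v_new;
      std::printf("local_lr=%g v=%g p=%g\n", local_lr, v_new, p_new);
      return 0;
    }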
diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index 1dace4ed6ab3e17b348035e34f6d9ea6d31edae9..eb0111ae4de2f066359e26406f6c7ec3eb54d5fc 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -23,14 +23,16 @@ __global__ void MomentumLarsKernel(const T* p, const T* g, const T* v, const T* learning_rate, const T mu, const int64_t num, const T lars_coeff, const T lars_weight_decay, const T* p_norm, - const T* g_norm, T* p_out, T* v_out) { + const T* g_norm, T* p_out, T* v_out, + const T epsilon) { T lr = learning_rate[0]; T local_lr = learning_rate[0]; CUDA_KERNEL_LOOP(i, num) { - if (p_norm[0] > 0 && g_norm[0] > 0) { + if (lars_weight_decay > 0 && p_norm[0] > 0 && g_norm[0] > 0) { local_lr = lr * lars_coeff * p_norm[0] / - (g_norm[0] + lars_weight_decay * p_norm[0]); + (g_norm[0] + lars_weight_decay * p_norm[0] + epsilon); } + T v_new = v[i] * mu + local_lr * (g[i] + lars_weight_decay * p[i]); v_out[i] = v_new; p_out[i] = p[i] - v_new; @@ -54,6 +56,7 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel { T mu = static_cast(ctx.Attr("mu")); T lars_coeff = ctx.Attr("lars_coeff"); T lars_weight_decay = ctx.Attr("lars_weight_decay"); + T epsilon = ctx.Attr("epsilon"); auto* p = param->data(); auto* v = velocity->data(); @@ -79,7 +82,7 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel { eg_norm.device(*place) = eigen_g.square().sum().sqrt(); MomentumLarsKernel<<>>( p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay, - p_norm_data, g_norm_data, p_out, v_out); + p_norm_data, g_norm_data, p_out, v_out, epsilon); } }; diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.h b/paddle/fluid/operators/optimizers/lars_momentum_op.h old mode 100644 new mode 100755 index e0064c201825b1f074eb53c591dc3abdd7bc1e1b..b579b5143ddbe6221738f9864f13fb7bea4ac509 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.h +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.h @@ -39,6 +39,7 @@ class LarsMomentumOpKernel : public framework::OpKernel { T mu = static_cast(ctx.Attr("mu")); T lars_coeff = ctx.Attr("lars_coeff"); T lars_weight_decay = ctx.Attr("lars_weight_decay"); + T epsilon = ctx.Attr("epsilon"); auto p_out = framework::EigenVector::Flatten(*param_out); auto v_out = framework::EigenVector::Flatten(*velocity_out); @@ -59,9 +60,9 @@ class LarsMomentumOpKernel : public framework::OpKernel { ep_norm = p.square().sum().sqrt(); eg_norm = g.square().sum().sqrt(); T local_lr = lr[0]; - if (ep_norm(0) > 0 && eg_norm(0) > 0) { + if (lars_weight_decay > 0 && ep_norm(0) > 0 && eg_norm(0) > 0) { local_lr = lr[0] * lars_coeff * ep_norm(0) / - (eg_norm(0) + lars_weight_decay * ep_norm(0)); + (eg_norm(0) + lars_weight_decay * ep_norm(0) + epsilon); } v_out = v * mu + local_lr * (g + lars_weight_decay * p); p_out = p - v_out; diff --git a/paddle/fluid/operators/reduce_ops/CMakeLists.txt b/paddle/fluid/operators/reduce_ops/CMakeLists.txt index 3da481a142aa2282aade661de7679cf4edf597a0..a68666b100cb52c722c4fefc849e94947130010f 100644 --- a/paddle/fluid/operators/reduce_ops/CMakeLists.txt +++ b/paddle/fluid/operators/reduce_ops/CMakeLists.txt @@ -1,6 +1,10 @@ include(operators) if(WITH_GPU) - register_operators(DEPS cub) + if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + register_operators(DEPS cub) + else() + register_operators() + endif() else() register_operators() endif() @@ -24,5 +28,9 @@ if(WITH_GPU) endif() if(WITH_GPU) - 
nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor cub) + if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor cub) + else() + nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor) + endif() endif() diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc index 322a1637f5deec909db13f1bd0433446cd7606ae..7cd164bfd3a3d77288b59c40f147ae9cdd8215e0 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc @@ -13,18 +13,138 @@ // limitations under the License. #include "paddle/fluid/operators/reduce_ops/logsumexp_op.h" -#include +#include #include -#include #include namespace paddle { namespace operators { -class LogsumexpOpMaker : public ops::ReduceOpMaker { - protected: - virtual std::string GetName() const { return "logsumexp"; } - virtual std::string GetOpType() const { return "Reduce logsumexp"; } +class LogsumexpOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "logsumexp"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "logsumexp"); + auto x_dims = ctx->GetInputDim("X"); + auto x_rank = x_dims.size(); + PADDLE_ENFORCE_LE(x_rank, 4, + platform::errors::InvalidArgument( + "The input tensor X's dimensions of logsumexp " + "should be less than or equal to 4. But received X's " + "dimensions = %d, X's shape = [%s].", + x_rank, x_dims)); + auto axis = ctx->Attrs().Get>("axis"); + PADDLE_ENFORCE_GT( + axis.size(), 0, + platform::errors::InvalidArgument( + "The size of axis of logsumexp " + "should be greater than 0. But received the size of axis " + "of logsumexp is %d.", + axis.size())); + + for (size_t i = 0; i < axis.size(); i++) { + PADDLE_ENFORCE_LT( + axis[i], x_rank, + platform::errors::InvalidArgument( + "axis[%d] should be in the " + "range [-dimension(X), dimension(X)] " + "where dimension(X) is %d. But received axis[i] = %d.", + i, x_rank, axis[i])); + PADDLE_ENFORCE_GE( + axis[i], -x_rank, + platform::errors::InvalidArgument( + "axis[%d] should be in the " + "range [-dimension(X), dimension(X)] " + "where dimension(X) is %d. But received axis[i] = %d.", + i, x_rank, axis[i])); + if (axis[i] < 0) { + axis[i] += x_rank; + } + } + + bool keepdim = ctx->Attrs().Get("keepdim"); + bool reduce_all = ctx->Attrs().Get("reduce_all"); + auto dims_vector = vectorize(x_dims); + if (reduce_all) { + if (keepdim) + ctx->SetOutputDim( + "Out", framework::make_ddim(std::vector(x_rank, 1))); + else + ctx->SetOutputDim("Out", {1}); + } else { + auto dims_vector = vectorize(x_dims); + if (keepdim) { + for (size_t i = 0; i < axis.size(); ++i) { + dims_vector[axis[i]] = 1; + } + } else { + const int kDelFlag = -1; + for (size_t i = 0; i < axis.size(); ++i) { + dims_vector[axis[i]] = kDelFlag; + } + dims_vector.erase( + std::remove(dims_vector.begin(), dims_vector.end(), kDelFlag), + dims_vector.end()); + } + if (!keepdim && dims_vector.size() == 0) { + dims_vector.push_back(1); + } + auto out_dims = framework::make_ddim(dims_vector); + ctx->SetOutputDim("Out", out_dims); + if (axis.size() > 0 && axis[0] != 0) { + // Only pass LoD when not reducing on the first dim. 
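Note: the InferShape added above wraps negative axes by the rank and then either keeps reduced axes as length-1 (keepdim) or erases them via the kDelFlag trick. A hedged standalone sketch of that shape rule, using plain std::vector instead of framework::DDim:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    std::vector<int> ReducedDims(std::vector<int> dims, std::vector<int> axis,
                                 bool keepdim) {
      const int rank = static_cast<int>(dims.size());
      for (int& a : axis) {
        if (a < 0) a += rank;  // wrap negative axes, as InferShape does
      }
      if (keepdim) {
        for (int a : axis) dims[a] = 1;
      } else {
        const int kDelFlag = -1;
        for (int a : axis) dims[a] = kDelFlag;  // mark, then erase
        dims.erase(std::remove(dims.begin(), dims.end(), kDelFlag), dims.end());
        if (dims.empty()) dims.push_back(1);  // full reduction yields shape [1]
      }
      return dims;
    }

    int main() {
      // Input shape [2, 3, 4], axis = {-2}: keepdim=false gives [2, 4],
      // keepdim=true would give [2, 1, 4].
      for (int d : ReducedDims({2, 3, 4}, {-2}, false)) std::printf("%d ", d);
      std::printf("\n");
      return 0;
    }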
+ ctx->ShareLoD("X", /*->*/ "Out"); + } + } + } +}; + +class LogsumexpOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor) The input tensor. Tensors with rank at most 4 are " + "supported."); + AddOutput("Out", "(Tensor) The result tensor."); + AddAttr>( + "axis", + "(list, default {0}) The dimensions to reduce. " + "Must be in the range [-rank(input), rank(input)). " + "If `axis[i] < 0`, the axis[i] to reduce is `rank + axis[i]`. " + "Note that reducing on the first dim will make the LoD info lost.") + .SetDefault({0}); + AddAttr("keepdim", + "(bool, default false) " + "If true, retain the reduced dimension with length 1.") + .SetDefault(false); + AddAttr("reduce_all", + "(bool, default false) " + "If true, output a scalar reduced along all dimensions.") + .SetDefault(false); + AddComment(string::Sprintf(R"DOC( +logsumexp Operator. + +This operator computes the logsumexp of input tensor along the given axis. +The result tensor has 1 fewer dimension than the input unless keep_dim is true. +If reduce_all is true, just reduce along all dimensions and output a scalar. + +)DOC")); + } +}; + +class LogsumexpGrapOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "logsumexp"); + OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "logsumexp"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out@GRAD", "logsumexp"); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } }; template @@ -32,7 +152,6 @@ class LogsumexpGradOpMaker : public framework::SingleGradOpMaker { public: using framework::SingleGradOpMaker::SingleGradOpMaker; - protected: void Apply(GradOpPtr op) const override { op->SetType("logsumexp_grad"); op->SetInput("X", this->Input("X")); @@ -46,18 +165,17 @@ class LogsumexpGradOpMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle -REGISTER_OPERATOR(logsumexp, ops::ReduceOp, ops::LogsumexpOpMaker, +namespace ops = paddle::operators; + +REGISTER_OPERATOR(logsumexp, ops::LogsumexpOp, ops::LogsumexpOpMaker, ops::LogsumexpGradOpMaker, ops::LogsumexpGradOpMaker); -REGISTER_OPERATOR(logsumexp_grad, ops::ReduceGradOp); +REGISTER_OPERATOR(logsumexp_grad, ops::LogsumexpGrapOp); -REGISTER_OP_CPU_KERNEL(logsumexp, - ops::ReduceKernel, - ops::ReduceKernel); REGISTER_OP_CPU_KERNEL( - logsumexp_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel); + logsumexp, ops::LogsumexpKernel, + ops::LogsumexpKernel); +REGISTER_OP_CPU_KERNEL( + logsumexp_grad, + ops::LogsumexpGradKernel, + ops::LogsumexpGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.cu b/paddle/fluid/operators/reduce_ops/logsumexp_op.cu index c9ad1075c0c3c1c6f405144dbfde2e81b85124aa..86a31595ebaabcbc07fab64779c33566d5b020eb 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op.cu +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.cu @@ -14,8 +14,8 @@ #include "paddle/fluid/operators/reduce_ops/logsumexp_op.h" -REGISTER_OP_CUDA_KERNEL(logsumexp, - ops::ReduceKernel, - ops::ReduceKernel); +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + logsumexp, ops::LogsumexpKernel, + ops::LogsumexpKernel); diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.h b/paddle/fluid/operators/reduce_ops/logsumexp_op.h index 
1d0e00262a37ff7160abd7a865e63377f8b30461..a478690976bd396db921b465d171a422451e0742 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op.h +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.h @@ -14,11 +14,20 @@ #pragma once -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" +#include +#include +#include "paddle/fluid/operators/reduce_ops/reduce_op_function.h" namespace paddle { namespace operators { +#define HANDLE_DIM(NDIM, RDIM) \ + if (ndim == NDIM && rdim == RDIM) { \ + ReduceFunctor( \ + context.template device_context(), *input, output, \ + axis, keepdim); \ + } + struct LogsumexpFunctor { template void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { @@ -54,5 +63,106 @@ struct LogsumexpGradFunctor { } }; +template +class LogsumexpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + + auto axis = context.Attr>("axis"); + auto keepdim = context.Attr("keepdim"); + auto reduce_all = context.Attr("reduce_all"); + + const auto& input_dim_size = input->dims().size(); + // The dims has full dim, set the reduce_all is True + reduce_all |= (static_cast(axis.size()) == input_dim_size); + + if (reduce_all) { + // Flatten and reduce 1-D tensor + auto x = EigenVector::Flatten(*input); + auto out = EigenScalar::From(*output); + auto& place = + *context.template device_context().eigen_device(); + auto reduce_dim = Eigen::array({{0}}); + LogsumexpFunctor()(place, &x, &out, reduce_dim); + } else { + int ndim = input_dim_size; + int rdim = axis.size(); + // comments for accelerating compiling temporarily. + // HANDLE_DIM(6, 5); + // HANDLE_DIM(6, 4); + // HANDLE_DIM(6, 3); + // HANDLE_DIM(6, 2); + // HANDLE_DIM(6, 1); + // HANDLE_DIM(5, 4); + // HANDLE_DIM(5, 3); + // HANDLE_DIM(5, 2); + // HANDLE_DIM(5, 1); + HANDLE_DIM(4, 3); + HANDLE_DIM(4, 2); + HANDLE_DIM(4, 1); + HANDLE_DIM(3, 2); + HANDLE_DIM(3, 1); + HANDLE_DIM(2, 1); + } + } +}; + +template +class LogsumexpGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Input("Out"); + auto* output_grad = context.Input(framework::GradVarName("Out")); + auto* input_grad = context.Output(framework::GradVarName("X")); + input_grad->mutable_data(context.GetPlace()); + + auto axis = context.Attr>("axis"); + auto reduce_all = context.Attr("reduce_all"); + const auto input_dim_size = context.Input("X")->dims().size(); + reduce_all |= (static_cast(axis.size()) == input_dim_size); + + if (reduce_all) { + auto x = EigenVector::Flatten(*input); + auto y = EigenVector::Flatten(*output); + auto dy = EigenVector::Flatten(*output_grad); + auto dx = EigenVector::Flatten(*input_grad); + auto& place = + *context.template device_context().eigen_device(); + auto broadcast_dim = + Eigen::array({{static_cast(input->numel())}}); + LogsumexpGradFunctor()(place, &x, &y, &dx, &dy, broadcast_dim, + broadcast_dim[0]); + } else { + int rank = input->dims().size(); + switch (rank) { + case 1: + ReduceGradFunctor( + context.template device_context(), *input, *output, + *output_grad, input_grad, axis); + break; + case 2: + ReduceGradFunctor( + context.template device_context(), *input, *output, + *output_grad, input_grad, axis); + break; + case 3: + ReduceGradFunctor( + context.template device_context(), *input, *output, + 
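Note: the kernels above delegate the reduction to LogsumexpFunctor; numerically this is the standard max-shift identity, logsumexp(x) = m + log(sum_i exp(x_i - m)) with m = max_i x_i, which keeps every exponent non-positive. A standalone sketch of that identity:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    float LogSumExp(const std::vector<float>& x) {
      float m = *std::max_element(x.begin(), x.end());
      float s = 0.f;
      for (float v : x) s += std::exp(v - m);  // every exponent is <= 0
      return m + std::log(s);
    }

    int main() {
      // A naive exp(1000.f) overflows; the shifted form returns 1000 + log(2).
      std::printf("%f\n", LogSumExp({1000.f, 1000.f}));
      return 0;
    }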
*output_grad, input_grad, axis); + break; + case 4: + ReduceGradFunctor( + context.template device_context(), *input, *output, + *output_grad, input_grad, axis); + break; + } + } + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.part.cu b/paddle/fluid/operators/reduce_ops/logsumexp_op.part.cu index d6ad4863092a50233b806c944db0b8c161ed9dd0..81124e4f070a54444f4305dc903280548ac10b60 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op.part.cu +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.part.cu @@ -15,8 +15,9 @@ // .part used to speed up nvcc compile #include "paddle/fluid/operators/reduce_ops/logsumexp_op.h" +namespace ops = paddle::operators; + REGISTER_OP_CUDA_KERNEL( - logsumexp_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel); + logsumexp_grad, + ops::LogsumexpGradKernel, + ops::LogsumexpGradKernel); diff --git a/paddle/fluid/operators/run_program_op.cc b/paddle/fluid/operators/run_program_op.cc index 04559a93c866c72f2d0b309a5005557134355666..2d599716443901053aa3d5dc8e93759320175b24 100644 --- a/paddle/fluid/operators/run_program_op.cc +++ b/paddle/fluid/operators/run_program_op.cc @@ -27,9 +27,6 @@ class RunProgramOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->HasInputs("X"), true, platform::errors::NotFound( "Input(X) of RunProgramOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInputs("Params"), true, - platform::errors::NotFound( - "Input(Params) of RunProgramOp should not be null.")); PADDLE_ENFORCE_EQ(ctx->HasOutputs("Out"), true, platform::errors::NotFound( "Output(Out) of RunProgramOp should not be null.")); @@ -73,7 +70,8 @@ class RunProgramOpMaker : public framework::OpProtoAndCheckerMaker { "(vector)" "The input parameter of RunProgram operator, also the parameters " "of the loaded program.") - .AsDuplicable(); + .AsDuplicable() + .AsDispensable(); AddOutput("Out", "(vector)" "The output tensors of RunProgram operator, also the fetch " @@ -121,10 +119,6 @@ class RunProgramGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->HasInputs("X"), true, platform::errors::NotFound( "Input(X) of RunProgramGradOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInputs("Params"), true, - platform::errors::NotFound( - "Input(Params) of RunProgramGradOp should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasInputs(framework::GradVarName("Out")), true, platform::errors::NotFound( diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index 1c493fc6be093a2af8f58c8e78d1be43de34306f..5afe25cf687fc96d1eaac33b2d0516c96c394a46 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -209,9 +209,14 @@ class RunProgramOpKernel : public framework::OpKernel { auto output_vars = ctx.MultiOutputVar("Out"); auto input_var_names = ctx.InputNames("X"); - auto param_names = ctx.InputNames("Params"); auto output_var_names = ctx.OutputNames("Out"); + // current program may not hold parameters + std::vector param_names; + if (!param_vars.empty()) { + param_names = ctx.InputNames("Params"); + } + auto *block = ctx.Attr("global_block"); auto *program = block->Program(); auto start_op_index = ctx.Attr("start_op_index"); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index cc6ee7b19ea99fe61ef00beaf475fc35c8a0a809..9cfe47da5db7ba15c9b24a8d551606f805ad9b15 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ 
b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -208,8 +208,11 @@ class TensorRTEngineOp : public framework::OperatorBase { auto stream = reinterpret_cast(dev_ctx).stream(); - PADDLE_ENFORCE_EQ(input_names_.empty(), false, - "should pass at least one input"); + PADDLE_ENFORCE_EQ( + input_names_.empty(), false, + platform::errors::PreconditionNotMet( + "TensorRT engine needs at least one input, but no input is found. " + "Please check if you set the input correctly.")); std::vector output_maps = Attr>("output_name_mapping"); @@ -295,12 +298,19 @@ class TensorRTEngineOp : public framework::OperatorBase { #endif } auto *fluid_v = scope.FindVar(y); - PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y); + PADDLE_ENFORCE_NOT_NULL( + fluid_v, + platform::errors::NotFound( + "Output variable %s is not found in TensorRT subgraph.", y)); auto *fluid_t = fluid_v->GetMutable(); fluid_t->Resize(framework::make_ddim(ddim)); - PADDLE_ENFORCE(bind_index < num_bindings, - "The bind index should be less than num_bindings"); + PADDLE_ENFORCE_LT(bind_index, num_bindings, + platform::errors::InvalidArgument( + "The binding index in TRT engine should be less " + "than the number of bindings, but got binding " + "index = %d, number of bindings = %d.", + bind_index, num_bindings)); buffers[bind_index] = static_cast(fluid_t->mutable_data( BOOST_GET_CONST(platform::CUDAPlace, dev_place))); diff --git a/paddle/fluid/operators/tile_op.cc b/paddle/fluid/operators/tile_op.cc index da4ca87296d92fc1052f462ae6ee8a3acb05eb49..bc1cb3b4aa1c1bdd0a9be39a4e113301d65ce5b5 100644 --- a/paddle/fluid/operators/tile_op.cc +++ b/paddle/fluid/operators/tile_op.cc @@ -241,6 +241,26 @@ class TileGradOpMaker : public framework::SingleGradOpMaker { } }; +template +class TileDoubleGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("tile"); + op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); + op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); + if (this->HasInput("repeat_times_tensor")) { + op->SetInput("repeat_times_tensor", this->Input("repeat_times_tensor")); + } + if (this->HasInput("RepeatTimes")) { + op->SetInput("RepeatTimes", this->Input("RepeatTimes")); + } + op->SetAttrMap(this->Attrs()); + } +}; + DECLARE_NO_NEED_BUFFER_VARS_INFERER(TileGradNoNeedBufVarsInferer, "X"); } // namespace operators @@ -251,6 +271,8 @@ REGISTER_OPERATOR(tile, ops::TileOp, ops::TileOpMaker, ops::TileGradOpMaker, ops::TileGradOpMaker); REGISTER_OPERATOR(tile_grad, ops::TileGradOp, + ops::TileDoubleGradOpMaker, + ops::TileDoubleGradOpMaker, ops::TileGradNoNeedBufVarsInferer); REGISTER_OP_CPU_KERNEL( tile, ops::TileKernel, diff --git a/paddle/fluid/operators/utils.h b/paddle/fluid/operators/utils.h index e53981a53653a4830a39ceae47f4024bb757b039..aec995304a77118ecbf788ca3984c7e9da531f18 100644 --- a/paddle/fluid/operators/utils.h +++ b/paddle/fluid/operators/utils.h @@ -81,5 +81,26 @@ inline std::vector GetDataFromTensorList( } return vec_new_data; } + +inline framework::DDim GetShape(const framework::ExecutionContext& ctx) { + // 1. shape is a Tensor + if (ctx.HasInput("ShapeTensor")) { + auto* shape_tensor = ctx.Input("ShapeTensor"); + auto vec_shape = GetDataFromTensor(shape_tensor); + return framework::make_ddim(vec_shape); + } + + // 2. 
shape is a list/tuple containing Tensor + auto shape_tensor_list = ctx.MultiInput("ShapeTensorList"); + if (shape_tensor_list.size() > 0) { + auto vec_shape = GetDataFromTensorList(shape_tensor_list); + return framework::make_ddim(vec_shape); + } + + // 3. shape is a list/tuple without containing Tensor + auto vec_shape = ctx.Attr>("shape"); + return framework::make_ddim(vec_shape); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 178ecaff7e8d2e575cd64927fe4e39c773b2cb99..f751136640caad6acd3230bc22cd0e3f0fafe9fb 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -111,7 +111,9 @@ std::map> op_passing_outs_map = { {"fake_quantize_dequantize_moving_average_abs_max", {"Out", "OutScale", "OutAccum", "OutState"}}, {"fake_quantize_dequantize_abs_max", {"Out", "OutScale"}}, - {"amp_check_finite_and_scale", {"Out", "FoundInfinite"}}, + {"check_finite_and_unscale", {"Out", "FoundInfinite"}}, + {"update_loss_scaling", + {"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}}, }; // clang-format off diff --git a/paddle/fluid/train/CMakeLists.txt b/paddle/fluid/train/CMakeLists.txt index 7eab677fac1683fdc95c9e338b1099d78b5cabc3..235d92ac4f9e88947cea04425b0916b8a0290979 100644 --- a/paddle/fluid/train/CMakeLists.txt +++ b/paddle/fluid/train/CMakeLists.txt @@ -27,8 +27,6 @@ function(train_test TARGET_NAME) endif() set_tests_properties(test_train_${TARGET_NAME}${arg} PROPERTIES DEPENDS test_${TARGET_NAME}) - set_tests_properties(test_train_${TARGET_NAME}${arg} - PROPERTIES LABELS "RUN_TYPE=DIST") if(NOT WIN32 AND NOT APPLE) set_tests_properties(test_train_${TARGET_NAME}${arg} PROPERTIES TIMEOUT 150) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 1616e237092037303073236809a16a72651568cd..15610abef0f2d07eeb02e37bb0d4cbf394c94d90 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -154,6 +154,7 @@ echo Step 2. Buile Paddle ... echo ======================================== call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64 +for /F %%# in ('wmic cpu get NumberOfLogicalProcessors^|findstr [0-9]') do set /a PARALLEL_PROJECT_COUNT=%%#*8/10 set build_times=1 :build_tp echo Build third_party the %build_times% time: @@ -172,7 +173,7 @@ echo Build third_party successfully! set build_times=1 :build_paddle echo Build Paddle the %build_times% time: -msbuild /m /p:Configuration=Release /verbosity:minimal paddle.sln +msbuild /m:%PARALLEL_PROJECT_COUNT% /p:Configuration=Release /verbosity:minimal paddle.sln if %ERRORLEVEL% NEQ 0 ( set /a build_times=%build_times%+1 if %build_times% GTR 2 ( diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 88d9e6e55d577dbb5b883d1c59e2c58d54373742..ec07565c5af6c7ba79c15d9a335313775719c682 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -296,13 +296,13 @@ function check_style() { commit_files=on for file_name in `git diff --numstat upstream/$BRANCH |awk '{print $NF}'`;do if ! pre-commit run --files $file_name ; then - git diff commit_files=off fi done if [ $commit_files == 'off' ];then echo "code format error" + git diff 2>&1 exit 4 fi trap : 0 @@ -1447,7 +1447,7 @@ function example() { cd ${PADDLE_ROOT}/tools python sampcd_processor.py cpu;example_error=$? 
if [ "$example_error" != "0" ];then - echo "Code instance execution failed" + echo "Code instance execution failed" >&2 exit 5 fi } @@ -1456,15 +1456,25 @@ function summary_check_problems() { set +x local check_style_code=$1 local example_code=$2 + local check_style_info=$3 + local example_info=$4 if [ $check_style_code -ne 0 -o $example_code -ne 0 ];then echo "========================================" echo "summary problems:" + if [ $check_style_code -ne 0 -a $example_code -ne 0 ];then + echo "There are 2 errors: Code format error and Example code error." + else + [ $check_style_code -ne 0 ] && echo "There is 1 error: Code format error." + [ $example_code -ne 0 ] && echo "There is 1 error: Example code error." + fi echo "========================================" if [ $check_style_code -ne 0 ];then - echo "- Check code style failed! Please check the log and fix problems." + echo "*****Code format error***** Please fix it according to the diff information:" + echo "$check_style_info" | grep "code format error" -A $(echo "$check_style_info" | wc -l) fi if [ $example_code -ne 0 ];then - echo "- Check example code failed! Please check the log and fix problems." + echo "*****Example code error***** Please fix the error listed in the information:" + echo "$example_info" | grep "API check -- Example Code" -A $(echo "$example_info" | wc -l) fi [ $check_style_code -ne 0 ] && exit $check_style_code [ $example_code -ne 0 ] && exit $example_code @@ -1486,15 +1496,16 @@ function main() { ;; build_and_check) set +e - $(check_style >&2) + check_style_info=$(check_style) check_style_code=$? generate_upstream_develop_api_spec ${PYTHON_ABI:-""} ${parallel_number} cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} check_sequence_op_unittest generate_api_spec ${PYTHON_ABI:-""} "PR" - $(example >&2) + set +e + example_info=$(example) example_code=$? - summary_check_problems $check_style_code $example_code + summary_check_problems $check_style_code $example_code "$check_style_info" "$example_info" assert_api_spec_approvals ;; build) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index d5793eb424ab794e3e8af8ef2312aac927c272e5..ed0b415d0bfd86b5160d339a286cfddac37cf4df 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -75,6 +75,7 @@ from .tensor.creation import full_like #DEFINE_ALIAS from .tensor.creation import triu #DEFINE_ALIAS from .tensor.creation import tril #DEFINE_ALIAS from .tensor.creation import meshgrid #DEFINE_ALIAS +from .tensor.creation import empty #DEFINE_ALIAS from .tensor.linalg import matmul #DEFINE_ALIAS from .tensor.linalg import dot #DEFINE_ALIAS # from .tensor.linalg import einsum #DEFINE_ALIAS diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 62967a202ab53e8a5dc835900280259508bb640d..1b86056c00443be4170757cee3cc60bbafd0f40b 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -17,6 +17,7 @@ from paddle.distributed.fleet.proto import distributed_strategy_pb2 from paddle.fluid.framework import Variable, set_flags, core from paddle.fluid.wrapped_decorator import wrap_decorator import google.protobuf.text_format +import google.protobuf __all__ = ["DistributedStrategy"] @@ -706,11 +707,7 @@ class DistributedStrategy(object): **Notes**: k_steps(int) The local steps for training before parameter synchronization. Default 1. 
- - If strategy.auto is set True, the local steps will be calculated automatically during training. - The algorithm is referenced in this paper: - `Adaptive Communication Strategies to Achieve the Best Error-Runtime Trade-off in Local-Update SGD `_. - In this case, k_steps indicates the first local steps which is suggested setting to 1. + begin_step(int) The step at which to begin localsgd training. Default 1. Examples: .. code-block:: python import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.localsgd = True - strategy.localsgd_configs = {"k_steps": 4} + strategy.localsgd_configs = {"k_steps": 4, + "begin_step": 30} """ return get_msg_dict(self.strategy.localsgd_configs) @@ -1133,7 +1131,91 @@ class DistributedStrategy(object): return False def __repr__(self): + spacing = 2 + max_k = 38 + max_v = 38 + + length = max_k + max_v + spacing + + h1_format = " " + "|{{:^{}s}}|\n".format(length) + h2_format = " " + "|{{:>{}s}}{}{{:^{}s}}|\n".format(max_k, " " * + spacing, max_v) + + border = " +" + "".join(["="] * length) + "+" + line = " +" + "".join(["-"] * length) + "+" + + draws = border + "\n" + draws += h1_format.format("") + draws += h1_format.format("DistributedStrategy Overview") + draws += h1_format.format("") + fields = self.strategy.DESCRIPTOR.fields + str_res = "" + + env_draws = line + "\n" for f in fields: - print("{}: {}".format(f.name, f.default_value)) - return str(self.strategy) + if "build_strategy" in f.name or "execution_strategy" in f.name: + continue + if "_configs" in f.name: + continue + else: + if isinstance(getattr(self.strategy, f.name), bool): + if hasattr(self.strategy, f.name + "_configs"): + if getattr(self.strategy, f.name): + draws += border + "\n" + draws += h1_format.format( + "{} = True, please check {}_configs".format( + f.name, f.name)) + draws += line + "\n" + my_configs = getattr(self.strategy, + f.name + "_configs") + config_fields = my_configs.DESCRIPTOR.fields + for ff in config_fields: + if isinstance( + getattr(my_configs, ff.name), + google.protobuf.pyext._message. 
+ RepeatedScalarContainer): + values = getattr(my_configs, ff.name) + for i, v in enumerate(values): + if i == 0: + draws += h2_format.format(ff.name, + str(v)) + else: + draws += h2_format.format("", + str(v)) + else: + draws += h2_format.format( + ff.name, + str(getattr(my_configs, ff.name))) + else: + env_draws += h2_format.format( + f.name, str(getattr(self.strategy, f.name))) + else: + env_draws += h2_format.format( + f.name, str(getattr(self.strategy, f.name))) + + result_res = draws + border + "\n" + h1_format.format( + "Environment Flags, Communication Flags") + result_res += env_draws + + build_strategy_str = border + "\n" + build_strategy_str += h1_format.format("Build Strategy") + build_strategy_str += line + "\n" + + fields = self.strategy.build_strategy.DESCRIPTOR.fields + for f in fields: + build_strategy_str += h2_format.format( + f.name, str(getattr(self.strategy.build_strategy, f.name))) + build_strategy_str += border + "\n" + + execution_strategy_str = h1_format.format("Execution Strategy") + execution_strategy_str += line + "\n" + + fields = self.strategy.execution_strategy.DESCRIPTOR.fields + for f in fields: + execution_strategy_str += h2_format.format( + f.name, str(getattr(self.strategy.execution_strategy, f.name))) + execution_strategy_str += border + "\n" + + result_res += build_strategy_str + execution_strategy_str + return result_res diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index b9189492694f3a628843156cb329a43787e64ad2..0dfcd5f3255efa945bbd4ac94b00433960eeaa22 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -231,7 +231,7 @@ class Fleet(object): Returns: int: worker numbers - + Examples: .. code-block:: python @@ -737,7 +737,7 @@ class Fleet(object): """ Set the value of the learning rate manually in the optimizer. Only work in dygraph mode - + Args: value (float|Tensor): the value of learning rate @@ -877,7 +877,7 @@ class Fleet(object): """ Execute the optimizer once. 
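A note on the signature change threaded through the rest of this patch (visible at the strict-auto call site just below, and in every meta optimizer's _enable_strategy): a context dict now accompanies the strategy. A rough sketch of the contract, assuming context carries at least "origin_main_program", which ParameterServerOptimizer reads to choose between geo and async mode:

    # sketch: the strict-auto branch enables every candidate optimizer,
    # passing the shared context alongside the user-defined strategy
    for opt in distributed_optimizer_list:
        opt._enable_strategy(self.user_defined_strategy, context)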
Only work in dygraph mode - + Returns: None Examples: @@ -1019,7 +1019,7 @@ class Fleet(object): if self.user_defined_strategy._is_strict_auto(): # turn on all the strategy for each optimizer for opt in distributed_optimizer_list: - opt._enable_strategy(self.user_defined_strategy) + opt._enable_strategy(self.user_defined_strategy, context) valid_optimizer_list = [] valid_graph_optimizer_list = [] diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 3da5aed8201ace6ccf9eed1ff322a7c6304de4a6..0e995200dde035842d89d9c503566b7b70ee67b7 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -347,12 +347,13 @@ def pretty_print_envs(envs, header=None): for k, v in envs.items(): max_k = max(max_k, len(k)) - h_format = "{{:^{}s}}{}{{:<{}s}}\n".format(max_k, " " * spacing, max_v) - l_format = "{{:<{}s}}{{}}{{:<{}s}}\n".format(max_k, max_v) + h_format = " " + "|{{:>{}s}}{}{{:^{}s}}|\n".format(max_k, " " * spacing, + max_v) + l_format = " " + "|{{:>{}s}}{{}}{{:^{}s}}|\n".format(max_k, max_v) length = max_k + max_v + spacing - border = "".join(["="] * length) - line = "".join(["-"] * length) + border = " +" + "".join(["="] * length) + "+" + line = " +" + "".join(["-"] * length) + "+" draws = "" draws += border + "\n" diff --git a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py index 938bd258847e72e43044f2e4f5550a86e064eae5..31a9913701c3e08f5268d578d09c15f5bf8a86f8 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py @@ -34,6 +34,9 @@ class AMPOptimizer(MetaOptimizerBase): loss, role_maker, user_defined_optimizer, user_defined_strategy) def _can_apply(self): + if not self.role_maker._is_collective: + return False + if self.user_defined_strategy.amp: return True return False @@ -42,7 +45,7 @@ class AMPOptimizer(MetaOptimizerBase): dist_strategy.amp = False dist_strategy.amp_configs = {} - def _enable_strategy(self, dist_strategy): + def _enable_strategy(self, dist_strategy, context): dist_strategy.amp = True dist_strategy.amp_configs = { "init_loss_scaling": 32768.0, diff --git a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py index d292f58456c3ad91d8ef2e2ddc4770b50d71cdfd..3f6ed1ed2f23d4595b3aadff6f259f9e27f129b2 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py @@ -53,6 +53,9 @@ class DGCOptimizer(MetaOptimizerBase): name=opt._name) def _can_apply(self): + if not self.role_maker._is_collective: + return False + if self.user_defined_strategy.dgc: if not isinstance(self.inner_opt, Momentum): logging.warn("dgc only works on Momentum optimizer") @@ -69,7 +72,7 @@ class DGCOptimizer(MetaOptimizerBase): dist_strategy.dgc = False dist_strategy.dgc_configs = {} - def _enable_strategy(self, dist_strategy): + def _enable_strategy(self, dist_strategy, context): dist_strategy.dgc = True dist_strategy.dgc_configs = {"rampup_begin_step": 0, "rampup_step": 1} @@ -89,5 +92,5 @@ class DGCOptimizer(MetaOptimizerBase): no_grad_set=None): optimize_ops, params_grads = \ self.dgc_opt.minimize(loss, startup_program, - parameter_list, no_grad_set) + parameter_list, no_grad_set) return optimize_ops, params_grads diff --git 
a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py index bb0c631e081971461655429c1415ec619a9f9dbc..f1b3680976541806d96ca815be64b03bcd499469 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py @@ -37,15 +37,18 @@ class GradientMergeOptimizer(MetaOptimizerBase): self.user_defined_strategy.gradient_merge_configs["avg"]) def _can_apply(self): + if not self.role_maker._is_collective: + return False + can_apply = (self.user_defined_strategy.gradient_merge == True) and \ - self.user_defined_strategy.gradient_merge_configs["k_steps"] > 1 + self.user_defined_strategy.gradient_merge_configs["k_steps"] > 1 return can_apply def _disable_strategy(self, dist_strategy): dist_strategy.gradient_merge = False dist_strategy.gradient_merge_configs = {} - def _enable_strategy(self, dist_strategy): + def _enable_strategy(self, dist_strategy, context): # we currently do not support auto-enable gradient merge return diff --git a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py index 03304f1b68b85f4bdf0452d0ebe88e2f46e2c94e..6c1cc3d7a9769a5c61997ab761a5458b7e8df4a3 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py @@ -48,7 +48,7 @@ class GraphExecutionOptimizer(MetaOptimizerBase): callbacks=None): pass - # should fix the variable + # should fix the variable def _setup_nccl_op(self, startup_program, main_program, build_strategy): trainer_endpoints = self.role_maker.get_trainer_endpoints() trainers = trainer_endpoints @@ -94,31 +94,31 @@ class GraphExecutionOptimizer(MetaOptimizerBase): dist_strategy = self.user_defined_strategy local_build_strategy = paddle.fluid.BuildStrategy() local_build_strategy.enable_sequential_execution = \ - dist_strategy.build_strategy.enable_sequential_execution + dist_strategy.build_strategy.enable_sequential_execution local_build_strategy.fuse_elewise_add_act_ops = \ - dist_strategy.build_strategy.fuse_elewise_add_act_ops + dist_strategy.build_strategy.fuse_elewise_add_act_ops local_build_strategy.fuse_bn_act_ops = \ - dist_strategy.build_strategy.fuse_bn_act_ops + dist_strategy.build_strategy.fuse_bn_act_ops local_build_strategy.enable_auto_fusion = \ - dist_strategy.build_strategy.enable_auto_fusion + dist_strategy.build_strategy.enable_auto_fusion local_build_strategy.fuse_relu_depthwise_conv = \ - dist_strategy.build_strategy.fuse_relu_depthwise_conv + dist_strategy.build_strategy.fuse_relu_depthwise_conv local_build_strategy.fuse_broadcast_ops = \ - dist_strategy.build_strategy.fuse_broadcast_ops + dist_strategy.build_strategy.fuse_broadcast_ops local_build_strategy.fuse_all_optimizer_ops = \ - dist_strategy.build_strategy.fuse_all_optimizer_ops + dist_strategy.build_strategy.fuse_all_optimizer_ops local_build_strategy.enable_inplace = \ - dist_strategy.build_strategy.enable_inplace + dist_strategy.build_strategy.enable_inplace local_build_strategy.use_hierarchical_allreduce = \ - dist_strategy.use_hierarchical_allreduce + dist_strategy.use_hierarchical_allreduce local_build_strategy.hierarchical_allreduce_inter_nranks = \ - dist_strategy.hierarchical_allreduce_inter_nranks + dist_strategy.hierarchical_allreduce_inter_nranks 
local_build_strategy.sync_batch_norm = \ - dist_strategy.sync_batch_norm + dist_strategy.sync_batch_norm local_build_strategy.fuse_all_reduce_ops = \ - dist_strategy.fuse_all_reduce_ops + dist_strategy.fuse_all_reduce_ops local_build_strategy.nccl_comm_num = \ - dist_strategy.nccl_comm_num + dist_strategy.nccl_comm_num if self.user_defined_strategy.recompute == True: logging.warn( @@ -190,7 +190,7 @@ class GraphExecutionOptimizer(MetaOptimizerBase): # TODO(guru4elephant): should close all PE related flags here return - def _enable_strategy(self, dist_strategy): + def _enable_strategy(self, dist_strategy, context): # by default, graph execution strategy is enabled return diff --git a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py index 3a9f2be533b8bc176b2361eaffbc74d4b834749c..df9887759e16fddb0579abdcdf3ef5f9024825e7 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py @@ -62,6 +62,9 @@ class LambOptimizer(MetaOptimizerBase): name=opt._name) def _can_apply(self): + if not self.role_maker._is_collective: + return False + if self.user_defined_strategy.lamb: if not isinstance(self.inner_opt, AdamOptimizer): logging.warn( @@ -75,7 +78,7 @@ class LambOptimizer(MetaOptimizerBase): dist_strategy.lamb = False dist_strategy.lamb_configs = {} - def _enable_strategy(self, dist_strategy): + def _enable_strategy(self, dist_strategy, context): dist_strategy.lamb = True dist_strategy.lamb_configs = { "lamb_weight_decay": 0.01, @@ -91,6 +94,10 @@ class LambOptimizer(MetaOptimizerBase): return self.lamb_opt.backward(loss, startup_program, parameter_list, no_grad_set, callbacks) + # the following function will be used by AMP if both LAMB and AMP are turned on together. + def apply_gradients(self, params_grads): + return self.lamb_opt.apply_gradients(params_grads=params_grads) + def minimize_impl(self, loss, startup_program=None, @@ -98,5 +105,5 @@ class LambOptimizer(MetaOptimizerBase): no_grad_set=None): optimize_ops, params_grads = \ self.lamb_opt.minimize(loss, startup_program, - parameter_list, no_grad_set) + parameter_list, no_grad_set) return optimize_ops, params_grads diff --git a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py index cb12154ddc564687539d953c21b9e0597a8bf893..609d8b85e714c1c7247898f8d506f9dadab9f499 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py @@ -44,13 +44,19 @@ class LarsOptimizer(MetaOptimizerBase): parameter_list=opt._parameter_list, regularization=opt.regularization, grad_clip=opt._grad_clip, - name=opt._name) + name=opt._name, + exclude_from_weight_decay=configs['exclude_from_weight_decay'], + epsilon=configs['epsilon']) def _can_apply(self): + if not self.role_maker._is_collective: + return False + if self.user_defined_strategy.lars: if not isinstance(self.inner_opt, Momentum): logging.warn( - "lars need the inner optimizer to be Momentum optimizer.") + "lars needs the inner optimizer to be Momentum optimizer but got {}.".
+ format(self.inner_opt.type)) return False return True return False @@ -59,7 +65,7 @@ class LarsOptimizer(MetaOptimizerBase): dist_strategy.lars = False dist_strategy.lars_configs = {} - def _enable_strategy(self, dist_strategy): + def _enable_strategy(self, dist_strategy, context): dist_strategy.lars = True dist_strategy.lars_configs = { "lars_coeff": 0.01, @@ -75,6 +81,10 @@ class LarsOptimizer(MetaOptimizerBase): return self.lars_opt.backward(loss, startup_program, parameter_list, no_grad_set, callbacks) + # the following function will be used by AMP if both LARS and AMP are turned on together. + def apply_gradients(self, params_grads): + return self.lars_opt.apply_gradients(params_grads=params_grads) + def minimize_impl(self, loss, startup_program=None, @@ -82,5 +92,5 @@ class LarsOptimizer(MetaOptimizerBase): no_grad_set=None): optimize_ops, params_grads = \ self.lars_opt.minimize(loss, startup_program, - parameter_list, no_grad_set) + parameter_list, no_grad_set) return optimize_ops, params_grads diff --git a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py index 3c1318301bb37bea71b896c220eb4a2090b334bf..6fa34d8d28a907d936500907db3e4c65ab4f4da8 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py @@ -29,6 +29,9 @@ class LocalSGDOptimizer(MetaOptimizerBase): self.snapshot_key = '@SNAPSHOT' def _can_apply(self): + if not self.role_maker._is_collective: + return False + if not self.user_defined_strategy.localsgd: return False @@ -36,17 +39,17 @@ class LocalSGDOptimizer(MetaOptimizerBase): return False return isinstance(self.inner_opt, paddle.optimizer.momentum.Momentum) \ - or isinstance(self.inner_opt, paddle.fluid.optimizer.Momentum) \ - or isinstance(self.inner_opt, paddle.optimizer.sgd.SGD) \ - or isinstance(self.inner_opt, paddle.fluid.optimizer.SGD) + or isinstance(self.inner_opt, paddle.fluid.optimizer.Momentum) \ + or isinstance(self.inner_opt, paddle.optimizer.sgd.SGD) \ + or isinstance(self.inner_opt, paddle.fluid.optimizer.SGD) def _disable_strategy(self, dist_strategy): dist_strategy.localsgd = False dist_strategy.localsgd_configs = {} - def _enable_strategy(self, dist_strategy): + def _enable_strategy(self, dist_strategy, context): dist_strategy.localsgd = True - dist_strategy.localsgd_configs = {"k_steps": 1} + dist_strategy.localsgd_configs = {"k_steps": 1, "begin_step": 1} def snapshot_name(self, param_name): return param_name + self.snapshot_key @@ -83,8 +86,9 @@ class LocalSGDOptimizer(MetaOptimizerBase): minimized = self.inner_opt.minimize( loss, startup_program=startup_program) - init_k_steps = self.user_defined_strategy.localsgd_configs['k_steps'] - auto_steps = self.user_defined_strategy.auto + k_steps_value = self.user_defined_strategy.localsgd_configs['k_steps'] + begin_step_value = self.user_defined_strategy.localsgd_configs[ + 'begin_step'] if startup_program is None: startup_program = default_startup_program() @@ -98,45 +102,28 @@ class LocalSGDOptimizer(MetaOptimizerBase): p2s = self.create_snapshot_vars(main_block.program) with program_guard(main_block.program, startup_program): - step = layers.autoincreased_step_counter(begin=0) + step = layers.autoincreased_step_counter(begin=1) k_steps = layers.create_global_var( name="k_steps", shape=[1], - value=init_k_steps, + value=k_steps_value, dtype='int64', persistable=True) + + begin_step = layers.create_global_var( +
name="begin_step", + shape=[1], + value=begin_step_value, + dtype='int64', + persistable=True) + last_step = layers.create_global_var( name="last_step", shape=[1], - value=int(0), + value=begin_step_value, dtype='int64', persistable=True) - if auto_steps: - avg_loss = layers.collective._c_allreduce( - loss) / self.role_maker.worker_num() - - lr_0 = layers.create_global_var( - name="lr_0", - shape=[1], - value=float(0), - dtype='float32', - persistable=True) - loss_0 = layers.create_global_var( - name="loss_0", - shape=[1], - value=float(0), - dtype='float32', - persistable=True) - - global_lr = self.inner_opt._global_learning_rate() - - def initialize(): - layers.assign(loss, loss_0) - layers.assign(global_lr, lr_0) - - layers.cond(step == 0, initialize) - def communicate(): sub_block = default_main_program().current_block() ring_id = -1 @@ -192,20 +179,10 @@ class LocalSGDOptimizer(MetaOptimizerBase): inputs={'X': [param]}, outputs={'Out': [snapshot]}, attrs={OP_ROLE_KEY: OpRole.Optimize}) - - if auto_steps: - next_local_steps = layers.cast( - layers.ceil( - layers.sqrt(lr_0 * loss / (global_lr * loss_0) * - float(init_k_steps))), - dtype='int64') - max_local_steps = layers.fill_constant( - shape=[1], dtype='int64', value=16) - next_local_steps = layers.elementwise_min(next_local_steps, - max_local_steps) - layers.assign(next_local_steps, k_steps) layers.assign(step, last_step) - layers.cond(step - last_step == k_steps, communicate) + def begin_localsgd(): + layers.cond(step - last_step == k_steps, communicate) + layers.cond(step > begin_step, begin_localsgd, communicate) return minimized diff --git a/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py index b105c25b3ad65c1c3a3fdac5b69af3c9e728c251..a12ca50442b1c3499d62216d1fecc709f3351382 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py +++ b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py @@ -48,7 +48,7 @@ class MetaOptimizerBase(Optimizer): raise NotImplementedError("you should implement disable strategy in {}". format(type(self).__name__)) - def _enable_strategy(self, dist_strategy): + def _enable_strategy(self, dist_strategy, context=None): raise NotImplementedError("you should implement enable strategy in {}". 
format(type(self).__name__)) diff --git a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py index c9260dd2f8c9d0d073b9453bc575cc3e2a8aa437..7dc532c86ea681d8479710732ec33e96c58c35d5 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py @@ -24,6 +24,9 @@ class ParameterServerGraphOptimizer(ParameterServerOptimizer): self.meta_optimizers_white_list = [] def _can_apply(self): + if self.role_maker._is_collective: + return False + k_steps = self.user_defined_strategy.a_sync_configs["k_steps"] if k_steps < 0: return False @@ -37,12 +40,11 @@ class ParameterServerGraphOptimizer(ParameterServerOptimizer): return True def _disable_strategy(self, dist_strategy): - dist_strategy.a_sync_configs = {} + return - def _enable_strategy(self, dist_strategy): + def _enable_strategy(self, dist_strategy, context): # only open up the async mode for auto-parallel - dist_strategy.a_sync = True - dist_strategy.a_sync_configs = {} + return def _is_graph_out(self): return True diff --git a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py index 7dca7b9cb88a37fb7954c3e8059a7ae09e2ef3a6..51d4d343165b9057c803a22aa428081109d7d35f 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py @@ -32,8 +32,6 @@ class ParameterServerOptimizer(MetaOptimizerBase): def _can_apply(self): if self.role_maker._is_collective: return False - if self.user_defined_strategy.auto == True: - return True k_steps = self.user_defined_strategy.a_sync_configs["k_steps"] return True if k_steps >= 0 else False @@ -134,7 +132,7 @@ class ParameterServerOptimizer(MetaOptimizerBase): return _main, _startup - def _try_auto_apply_geo(self, program, compiled_config): + def _can_apply_geo(self, dist_strategy, program): def get_sys_free_mem(): plat = platform.system() if platform.system() == "Darwin": @@ -163,36 +161,28 @@ class ParameterServerOptimizer(MetaOptimizerBase): "%s platform is unsupported is parameter server optimizer" % (platform.system())) - if self.user_defined_strategy.auto == False: - return - - a_sync_configs = self.user_defined_strategy.a_sync_configs - if a_sync_configs["k_steps"] >= 0: - return - - self.user_defined_strategy.a_sync = True if not isinstance(self.inner_opt, fluid.optimizer.SGDOptimizer): - # auto async - a_sync_configs["k_steps"] = 0 - self.user_defined_strategy.a_sync_configs = a_sync_configs - return + return False - from paddle.fluid.incubate.fleet.parameter_server.ir.vars_metatools import dtype_to_size free = get_sys_free_mem() - param_grad_pairs = compiled_config.origin_sparse_pairs + compiled_config.origin_dense_pairs - processed_var_names = set(["@EMPTY@"]) + from paddle.fluid.incubate.fleet.parameter_server.ir import vars_metatools + processed_var_names = set(["@EMPTY@"]) param_memory_size = 0 - for param_grad_pair in param_grad_pairs: - param, grad = param_grad_pair + for varname in program.global_block().vars: + var = program.global_block().vars[varname] + if not var.persistable or var.desc.type( + ) != core.VarDesc.VarType.LOD_TENSOR: + continue + param = vars_metatools.create_var_struct(var) param_memory_size += param.m_size - 
processed_var_names.add(param.name) + processed_var_names.add(varname) upper_mem_use = param_memory_size * 5.0 program_tmp_vars = dict() - batch_size = 1024 + eval_batch_size = 1024 for op in program.global_block().ops: for var_name in op.output_arg_names: if var_name in processed_var_names: @@ -215,23 +205,21 @@ class ParameterServerOptimizer(MetaOptimizerBase): data_count *= (-x) else: data_count *= x - program_tmp_vars[var_name] = (data_count, neg_dim_count, - dtype_to_size[var.dtype]) + program_tmp_vars[var_name] = ( + data_count, neg_dim_count, + vars_metatools.dtype_to_size[var.dtype]) for varname in program_tmp_vars: data_count, neg_dim_count, type_size = program_tmp_vars[varname] if neg_dim_count == 1: - data_count *= batch_size + data_count *= eval_batch_size var_memory = data_count * type_size upper_mem_use += var_memory if upper_mem_use < free: - # auto geo - a_sync_configs["k_steps"] = 800 + return True else: - # auto async - a_sync_configs["k_steps"] = 0 - self.user_defined_strategy.a_sync_configs = a_sync_configs + return False def minimize_impl(self, loss, @@ -240,6 +228,7 @@ class ParameterServerOptimizer(MetaOptimizerBase): no_grad_set=None): self.inner_opt.minimize(loss, startup_program, parameter_list, no_grad_set) + strategy = self._get_distributed_strategy() _origin_main_program = loss.block.program _origin_startup_program = startup_program @@ -247,11 +236,7 @@ class ParameterServerOptimizer(MetaOptimizerBase): compiled_config = public.CompileTimeStrategy(_origin_main_program, _origin_startup_program, - None, self.role_maker) - - self._try_auto_apply_geo(_origin_main_program, compiled_config) - - strategy = self._get_distributed_strategy() + strategy, self.role_maker) compiled_config.strategy = strategy if self.role_maker.is_worker() or self.role_maker._is_heter_worker(): @@ -267,9 +252,24 @@ class ParameterServerOptimizer(MetaOptimizerBase): return None, None def _disable_strategy(self, dist_strategy): - dist_strategy.a_sync_configs = {} - self.user_defined_strategy.a_sync_configs = {} + dist_strategy.a_sync = False + a_sync_configs = dist_strategy.a_sync_configs + a_sync_configs["k_steps"] = -1 + dist_strategy.a_sync_configs = a_sync_configs + + def _enable_strategy(self, dist_strategy, context): + a_sync_configs = dist_strategy.a_sync_configs + if a_sync_configs["k_steps"] >= 0: + return - def _enable_strategy(self, dist_strategy): dist_strategy.a_sync = True - dist_strategy.a_sync_configs = {} + a_sync_configs = dist_strategy.a_sync_configs + + is_geo = self._can_apply_geo(dist_strategy, + context["origin_main_program"]) + + if is_geo: + a_sync_configs["k_steps"] = 800 + else: + a_sync_configs["k_steps"] = 0 + dist_strategy.a_sync_configs = a_sync_configs diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py index 32c54d44867cc1b081d97c8f86d88b6613b30c2f..87fa70779111ea485319f50b58901c605fffa23c 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py @@ -103,6 +103,9 @@ class PipelineOptimizer(MetaOptimizerBase): self.wrapped_opt = PO(self.inner_opt, num_microbatches=num_microbatches) def _can_apply(self): + if not self.role_maker._is_collective: + return False + if self.user_defined_strategy.pipeline == True: return True return False @@ -111,7 +114,7 @@ class PipelineOptimizer(MetaOptimizerBase): dist_strategy.pipeline = False dist_strategy.pipeline_configs = {} - 
def _enable_strategy(self, dist_strategy): + def _enable_strategy(self, dist_strategy, context): # we do not support enable pipeline automatically right now return @@ -180,7 +183,7 @@ class PipelineOptimizer(MetaOptimizerBase): grad = None for idx, op in reversed(list(enumerate(block.ops))): if is_backward_op(op) and \ - OP_ROLE_VAR_KEY in op.attr_names: + OP_ROLE_VAR_KEY in op.attr_names: op_role_var = op.all_attrs()[OP_ROLE_VAR_KEY] if len(op_role_var) == 0: continue diff --git a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py index 267656824c9acea2b85341ae284d8634c922a095..8f9595486922a37cff02d1ac96c1c4c2bbf4b0d5 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py @@ -38,6 +38,9 @@ class RecomputeOptimizer(MetaOptimizerBase): list(user_defined_strategy.recompute_configs["checkpoints"])) def _can_apply(self): + if self.role_maker._is_collective: + return False + if self.user_defined_strategy.recompute == True: if len(self.user_defined_strategy.recompute_configs[ "checkpoints"]) == 0: @@ -49,7 +52,7 @@ class RecomputeOptimizer(MetaOptimizerBase): dist_strategy.recompute = False dist_strategy.recompute_configs = {} - def _enable_strategy(self, dist_strategy): + def _enable_strategy(self, dist_strategy, context): # we do not support automatically recompute checkpoints currently return diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 898c7d295641863740288e3f4e1da39266bce183..d51cacd1a5cad53ef77b325e5380100c537e057e 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -1756,6 +1756,12 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): op_path_dict = dict() op_path = _find_op_path_(block, targets, inputs, block_no_grad_set, op_path_dict) + + # find no grad var by op_path + no_grad_vars = _find_no_grad_vars(block, op_path, targets, + block_no_grad_set) + block_no_grad_set.update(no_grad_vars) + no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set))) grad_to_var = dict() grad_info_map = dict() diff --git a/python/paddle/fluid/contrib/mixed_precision/amp_nn.py b/python/paddle/fluid/contrib/mixed_precision/amp_nn.py new file mode 100644 index 0000000000000000000000000000000000000000..d4dc968ca0de44b01741bf1f1fbaac7a9a65287e --- /dev/null +++ b/python/paddle/fluid/contrib/mixed_precision/amp_nn.py @@ -0,0 +1,124 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
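The module added here wraps the two new AMP operators; a minimal composition sketch, mirroring how decorator.py consumes them later in this patch (grads, loss_scaling, num_good_steps and num_bad_steps are placeholder Variables created by the caller, and the scalar values are illustrative):

    grads, found_inf = check_finite_and_unscale(
        grads, loss_scaling, name="find_infinite_scale")
    grads = update_loss_scaling(
        grads, found_inf, loss_scaling, num_good_steps, num_bad_steps,
        incr_every_n_steps=1000, decr_every_n_nan_or_inf=2,
        incr_ratio=2.0, decr_ratio=0.5, name="update_loss_scaling")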
+ + from paddle.fluid.data_feeder import check_variable_and_dtype, check_type + from paddle.fluid.layer_helper import LayerHelper + from paddle.fluid.framework import Variable + + __all__ = ['check_finite_and_unscale', 'update_loss_scaling'] + + + def check_finite_and_unscale(x, scale, name=None): + """ + Check if input X contains all finite data and, if so, scale it by input Scale. + + $$Out = X / scale$$ + + If any tensor in X contains Inf or Nan, FoundInfinite will be 1 (True) and Out + will not be scaled. In this case, the data of Out should not be used, as it + may not be deterministic. + Otherwise, FoundInfinite will be 0 (False). + Args: + x(list|tuple): The input tensors of check_finite_and_unscale operator. + scale: The scale of check_finite_and_unscale operator. + """ + check_type(x, 'x', (tuple, list), 'check_finite_and_unscale') + for e in x: + check_variable_and_dtype(e, "x", ['float32', 'float64'], + 'check_finite_and_unscale') + + helper = LayerHelper("check_finite_and_unscale", **locals()) + found_inf = helper.create_variable_for_type_inference(dtype='bool') + + inputs = {'X': x, 'Scale': scale} + outputs = {'Out': x, 'FoundInfinite': found_inf} + helper.append_op( + type='check_finite_and_unscale', inputs=inputs, outputs=outputs) + + return x, found_inf + + + def update_loss_scaling(x, + found_inf, + prev_loss_scaling, + num_good_steps, + num_bad_steps, + incr_every_n_steps, + decr_every_n_nan_or_inf, + incr_ratio, + decr_ratio, + name=None): + """ + Update loss scaling according to overall gradients. If all gradients are + finite after incr_every_n_steps consecutive steps, loss scaling will increase + by incr_ratio. Otherwise, loss scaling will decrease by decr_ratio after + decr_every_n_nan_or_inf accumulated steps in which some gradients are infinite. + + Args: + x(list|tuple): The input tensors of update_loss_scaling operator. + found_inf (Variable): A boolean variable indicating whether + there is any infinite gradient. + prev_loss_scaling (Variable): Previous loss scaling. + num_good_steps (Variable): A variable that accumulates good steps in which + all gradients are finite. + num_bad_steps (Variable): A variable that accumulates bad steps in which + some gradients are infinite. + incr_every_n_steps (int): Increase loss scaling every n consecutive + steps with finite gradients. + decr_every_n_nan_or_inf (int): Decrease loss scaling every n accumulated + steps with nan or inf gradients. + incr_ratio(float): The multiplier to use when increasing the loss + scaling. + decr_ratio(float): The less-than-one multiplier to use when decreasing + loss scaling. + """ + + check_variable_and_dtype(prev_loss_scaling, "prev_loss_scaling", + ['float32', 'float64'], "update_loss_scaling") + check_type(x, 'x', (tuple, list), 'update_loss_scaling') + for e in x: + check_variable_and_dtype(e, "x", ['float32', 'float64'], + 'update_loss_scaling') + assert prev_loss_scaling.dtype == e.dtype, "The dtype of prev_loss_scaling should be equal to the dtype of x."
+ + helper = LayerHelper("update_loss_scaling", **locals()) + + inputs = { + 'X': x, + 'FoundInfinite': found_inf, + 'PrevLossScaling': prev_loss_scaling, + 'InGoodSteps': num_good_steps, + 'InBadSteps': num_bad_steps + } + + outputs = { + 'Out': x, + 'LossScaling': prev_loss_scaling, + 'OutGoodSteps': num_good_steps, + 'OutBadSteps': num_bad_steps + } + + attrs = { + 'incr_every_n_steps': incr_every_n_steps, + 'decr_every_n_nan_or_inf': decr_every_n_nan_or_inf, + 'incr_ratio': incr_ratio, + 'decr_ratio': decr_ratio, + } + + helper.append_op( + type='update_loss_scaling', inputs=inputs, outputs=outputs, attrs=attrs) + + return x diff --git a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py index bfbd2700ae10bac4ad37462b5d7844b90dd05bbe..c9112ac849ce0506b7afd941b2213710e06bd1c6 100644 --- a/python/paddle/fluid/contrib/mixed_precision/decorator.py +++ b/python/paddle/fluid/contrib/mixed_precision/decorator.py @@ -17,9 +17,11 @@ from ... import default_startup_program from ... import layers from ... import unique_name from . import fp16_utils -from .fp16_utils import update_loss_scaling, rewrite_program +from .fp16_utils import rewrite_program from .fp16_utils import update_role_var_grad from .fp16_lists import AutoMixedPrecisionLists +from .amp_nn import check_finite_and_unscale +from .amp_nn import update_loss_scaling __all__ = ["decorate"] @@ -67,10 +69,8 @@ class OptimizerWithMixedPrecision(object): persistable=True) self._use_dynamic_loss_scaling = use_dynamic_loss_scaling if self._use_dynamic_loss_scaling: - self._incr_every_n_steps = layers.fill_constant( - shape=[1], dtype='int32', value=incr_every_n_steps) - self._decr_every_n_nan_or_inf = layers.fill_constant( - shape=[1], dtype='int32', value=decr_every_n_nan_or_inf) + self._incr_every_n_steps = incr_every_n_steps + self._decr_every_n_nan_or_inf = decr_every_n_nan_or_inf self._incr_ratio = incr_ratio self._decr_ratio = decr_ratio self._num_good_steps = layers.create_global_var( @@ -139,49 +139,46 @@ class OptimizerWithMixedPrecision(object): # Change the op_role_var attr for some ops, so that gradients # transferred across GPUs can be FP16. update_role_var_grad(self._train_program, self._params_grads) - scaled_params_grads = [] - for p, g in self._params_grads: - with self._train_program._optimized_guard([p, g]): - scaled_g = g / self._loss_scaling - scaled_params_grads.append([p, scaled_g]) - return scaled_params_grads + return self._params_grads - def apply_gradients(self, scaled_params_grads): + def apply_gradients(self, params_grads): """ Check scaled gradients to determine whether to update loss scaling and update parameters by their scaled gradients, Args: - scaled_params_grads (list): A list of params and scaled grads. + params_grads (list): A list of params and scaled grads. Returns: A list of optimize operators. 
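A hedged end-to-end sketch of how the decorated optimizer defined in this file is typically driven (apply_gradients above is normally reached through minimize(); optimizer and avg_cost are placeholders, and only a subset of decorate() arguments is shown):

    from paddle.fluid.contrib import mixed_precision as mp

    mp_opt = mp.decorate(optimizer, use_dynamic_loss_scaling=True)
    ops, param_grads = mp_opt.minimize(avg_cost)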
""" - if self._use_dynamic_loss_scaling: + grads = [g for _, g in params_grads] + with self._train_program._optimized_guard(grads): + grads, found_inf = check_finite_and_unscale( + grads, self._loss_scaling, name="find_infinite_scale") - grads = [layers.reduce_sum(g) for [_, g] in scaled_params_grads] - all_grads = layers.concat(grads) - all_grads_sum = layers.reduce_sum(all_grads) - is_overall_finite = layers.isfinite(all_grads_sum) - - update_loss_scaling(is_overall_finite, self._loss_scaling, - self._num_good_steps, self._num_bad_steps, - self._incr_every_n_steps, - self._decr_every_n_nan_or_inf, self._incr_ratio, - self._decr_ratio) - - # apply_gradient append all ops in global block, thus we shouldn't - # apply gradient in the switch branch. - with layers.Switch() as switch: - with switch.case(is_overall_finite): - pass - with switch.default(): - for _, g in scaled_params_grads: - layers.assign(layers.zeros_like(g), g) - - optimize_ops = self._optimizer.apply_gradients(scaled_params_grads) + if self._use_dynamic_loss_scaling: + with self._train_program._optimized_guard(grads): + grads = update_loss_scaling( + grads, + found_inf, + self._loss_scaling, + self._num_good_steps, + self._num_bad_steps, + self._incr_every_n_steps, + self._decr_every_n_nan_or_inf, + self._incr_ratio, + self._decr_ratio, + name="update_loss_scaling") + + params_unscaled_grads = [] + for pg, new_g in zip(params_grads, grads): + params_unscaled_grads.append((pg[0], new_g)) + # apply_gradient append all ops in global block, thus we shouldn't + # apply gradient in the switch branch. + optimize_ops = self._optimizer.apply_gradients(params_unscaled_grads) return optimize_ops diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py index 328dafe6219adb3c6355de0bafc430c52725024f..0b142ff33de55f36410eb9c23cb75210fc9d6321 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py @@ -328,77 +328,3 @@ def update_role_var_grad(main_prog, params_grads): raise ValueError("The op {0} is not in program".format(op)) block.desc._remove_op(op_idx, op_idx + 1) block._sync_with_cpp() - - -def update_loss_scaling(is_overall_finite, prev_loss_scaling, num_good_steps, - num_bad_steps, incr_every_n_steps, - decr_every_n_nan_or_inf, incr_ratio, decr_ratio): - """ - Update loss scaling according to overall gradients. If all gradients is - finite after incr_every_n_steps, loss scaling will increase by incr_ratio. - Otherwise, loss scaling will decrease by decr_ratio after - decr_every_n_nan_or_inf steps and each step some gradients are infinite. - - Args: - is_overall_finite (Variable): A boolean variable indicates whether - all gradients are finite. - prev_loss_scaling (Variable): Previous loss scaling. - num_good_steps (Variable): A variable accumulates good steps in which - all gradients are finite. - num_bad_steps (Variable): A variable accumulates bad steps in which - some gradients are infinite. - incr_every_n_steps (Variable): A variable represents increasing loss - scaling every n consecutive steps with - finite gradients. - decr_every_n_nan_or_inf (Variable): A variable represents decreasing - loss scaling every n accumulated - steps with nan or inf gradients. - incr_ratio(float): The multiplier to use when increasing the loss - scaling. - decr_ratio(float): The less-than-one-multiplier to use when decreasing - loss scaling. 
- """ - zero_steps = layers.fill_constant(shape=[1], dtype='int32', value=0) - with layers.Switch() as switch: - with switch.case(is_overall_finite): - should_incr_loss_scaling = layers.less_than(incr_every_n_steps, - num_good_steps + 1) - with layers.Switch() as switch1: - with switch1.case(should_incr_loss_scaling): - new_loss_scaling = prev_loss_scaling * incr_ratio - loss_scaling_is_finite = layers.isfinite(new_loss_scaling) - with layers.Switch() as switch2: - with switch2.case(loss_scaling_is_finite): - layers.assign(new_loss_scaling, prev_loss_scaling) - with switch2.default(): - pass - layers.assign(zero_steps, num_good_steps) - layers.assign(zero_steps, num_bad_steps) - - with switch1.default(): - layers.increment(num_good_steps) - layers.assign(zero_steps, num_bad_steps) - - with switch.default(): - should_decr_loss_scaling = layers.less_than(decr_every_n_nan_or_inf, - num_bad_steps + 1) - with layers.Switch() as switch3: - with switch3.case(should_decr_loss_scaling): - new_loss_scaling = prev_loss_scaling * decr_ratio - static_loss_scaling = \ - layers.fill_constant(shape=[1], - dtype='float32', - value=1.0) - less_than_one = layers.less_than(new_loss_scaling, - static_loss_scaling) - with layers.Switch() as switch4: - with switch4.case(less_than_one): - layers.assign(static_loss_scaling, - prev_loss_scaling) - with switch4.default(): - layers.assign(new_loss_scaling, prev_loss_scaling) - layers.assign(zero_steps, num_good_steps) - layers.assign(zero_steps, num_bad_steps) - with switch3.default(): - layers.assign(zero_steps, num_good_steps) - layers.increment(num_bad_steps) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index 5662284483bf529034e42178c8a431f6286e31b8..8d399c929018f08eb3d02e50981566705536bbf5 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -192,7 +192,6 @@ class ImperativeQuantAware(object): assert len(input_dtype) == len( feed), "The length of input_shape should be equal to feed's." 
- prog_trans = dygraph.ProgramTranslator() with dygraph.guard(): model.eval() input_vars = [] diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py index 59dd9867abb95dea74e1cdc362b671e7d4120d70..e22c980b0a7c6030c5d6a2fbc4fd58d2ec66958a 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py @@ -209,15 +209,24 @@ class FakeQuantAbsMax(layers.Layer): return quant_out -def _get_fake_quant_type(quant_type, name, moving_rate, quant_bits, dtype, - quant_on_weight): +def _get_fake_quant_type(quant_type, **kwargs): + call_args = { + "name": kwargs.get("name", None), + "quant_bits": kwargs.get("quant_bits", 8), + "dtype": kwargs.get("dtype", "float32") + } + + if quant_type == 'abs_max': + call_args["quant_on_weight"] = kwargs.get("quant_on_weight", False) + elif quant_type == 'moving_average_abs_max': + call_args["moving_rate"] = kwargs.get("moving_rate", 0.9) + fake_quant_map = { - 'abs_max': - lambda: FakeQuantAbsMax(name, quant_bits, dtype, quant_on_weight), - 'moving_average_abs_max': - lambda: FakeQuantMovingAverage(name, moving_rate, quant_bits, dtype) + 'abs_max': FakeQuantAbsMax, + 'moving_average_abs_max': FakeQuantMovingAverage } - return fake_quant_map[quant_type]() + + return fake_quant_map[quant_type](**call_args) class QuantizedConv2D(layers.Layer): @@ -247,11 +256,18 @@ class QuantizedConv2D(layers.Layer): self.bias = getattr(layer, 'bias') # For FakeQuant self._fake_quant_weight = _get_fake_quant_type( - weight_quantize_type, self.weight.name, moving_rate, weight_bits, - self._dtype, True) + weight_quantize_type, + name=self.weight.name, + moving_rate=moving_rate, + quant_bits=weight_bits, + dtype=self._dtype, + quant_on_weight=True) self._fake_quant_input = _get_fake_quant_type( activation_quantize_type, - layer.full_name(), moving_rate, activation_bits, self._dtype, False) + name=layer.full_name(), + moving_rate=moving_rate, + quant_bits=activation_bits, + dtype=self._dtype) def forward(self, input): quant_input = self._fake_quant_input(input) @@ -326,11 +342,18 @@ class QuantizedLinear(layers.Layer): self.bias = getattr(layer, 'bias') # For FakeQuant self._fake_quant_weight = _get_fake_quant_type( - weight_quantize_type, self.weight.name, moving_rate, weight_bits, - self._dtype, True) + weight_quantize_type, + name=self.weight.name, + moving_rate=moving_rate, + quant_bits=weight_bits, + dtype=self._dtype, + quant_on_weight=True) self._fake_quant_input = _get_fake_quant_type( activation_quantize_type, - layer.full_name(), moving_rate, activation_bits, self._dtype, False) + name=layer.full_name(), + moving_rate=moving_rate, + quant_bits=activation_bits, + dtype=self._dtype) def forward(self, input): quant_input = self._fake_quant_input(input) diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index 6a996493e4df1e1facc6ccd205a8ae5105f92c5b..1ef0d494e0725084b0ddfddcafe93d49da0525d7 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -347,6 +347,92 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase): return self.__next__() +# NOTE(chenweihang): _worker_loop must be top level method to be pickled +def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event, + collate_fn, init_fn, worker_id, num_workers, + use_shared_memory): + try: 
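        # Annotation on the relocation recorded in the NOTE above: the
        # multiprocessing.Process target must be picklable, and a
        # module-level function is, while the previous bound method
        # dragged the whole iterator object (queues, threads) into the
        # pickle. That is also why use_shared_memory is now an explicit
        # argument instead of reading self._use_shared_memory.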
+ # NOTE: [ mmap files clear ] When the child process exits unexpectedly, + # some shared memory objects may have been applied for but have not yet + # been put into the inter-process Queue. This part of the object needs + # to be cleaned up when the process ends. + CleanupFuncRegistrar.register(_cleanup_mmap) + + # set signal handler + core._set_process_signal_handler() + + global _worker_info + _worker_info = WorkerInfo( + id=worker_id, num_workers=num_workers, dataset=dataset) + + init_exception = None + try: + if init_fn is not None: + init_fn(worker_id) + fetcher = _DatasetKind.create_fetcher(dataset_kind, dataset, + collate_fn, True) + except: + init_exception = Exception("init_fn failed in worker {}: " \ + "{}".format(worker_id, sys.exc_info())) + + iterator_drained = False + parent_watch_dog = ParentWatchDog() + + while parent_watch_dog.is_alive(): + try: + data = indices_queue.get(MP_INDICES_CHECK_INTERVAL) + except queue.Empty: + continue + + # None as poison pill, so worker event should be set + if data is None: + assert done_event.is_set() or iterator_drained, \ + "get None when worker done_event set" + break + # If worker done event is set but we still get data in + # indices_queue, remaining data should be fetched and skipped. + if done_event.is_set() or iterator_drained: + continue + + idx, indices = data + try: + if init_exception is not None: + batch = init_exception + init_exception = None + else: + batch = fetcher.fetch(indices) + except Exception as e: + if isinstance( + e, StopIteration) and dataset_kind == _DatasetKind.ITER: + out_queue.put(_IterableDatasetStopIteration(worker_id)) + iterator_drained = True + else: + out_queue.put((idx, e)) + else: + if use_shared_memory: + # FIXME(dkp): _convert_to_tensor_list only support np.array + # list now, should support paddle.Tensor list + if isinstance(batch[0][0], paddle.Tensor): + np_batch = [] + for sample in batch: + np_batch.append([s.numpy() for s in sample]) + batch = np_batch + + tensor_list = core._convert_to_tensor_list(batch) + out_queue.put((idx, tensor_list)) + core._remove_tensor_list_mmap_fds(tensor_list) + else: + out_queue.put((idx, batch)) + except KeyboardInterrupt: + # NOTE: Main process will raise KeyboardInterrupt anyways, ignore it in child process + pass + except: + six.reraise(*sys.exc_info()) + finally: + if use_shared_memory: + _cleanup_mmap() + + class _DataLoaderIterMultiProcess(_DataLoaderIterBase): def __init__(self, loader): super(_DataLoaderIterMultiProcess, self).__init__(loader) @@ -404,11 +490,11 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): indices_queue = multiprocessing.Queue() self._indices_queues.append(indices_queue) worker = multiprocessing.Process( - target=self._worker_loop, + target=_worker_loop, args=(self._dataset, self._dataset_kind, indices_queue, self._data_queue, self._workers_done_event, self._collate_fn, self._worker_init_fn, i, - self._num_workers)) + self._num_workers, self._use_shared_memory)) worker.daemon = True worker.start() self._workers.append(worker) @@ -483,90 +569,6 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): self._blocking_queue.kill() logging.error("DataLoader reader thread raised an exception!") - def _worker_loop(self, dataset, dataset_kind, indices_queue, out_queue, - done_event, collate_fn, init_fn, worker_id, num_workers): - try: - # NOTE: [ mmap files clear ] When the child process exits unexpectedly, - # some shared memory objects may have been applied for but have not yet - been put into the inter-process Queue.
This part of the object needs - # to be cleaned up when the process ends. - CleanupFuncRegistrar.register(_cleanup_mmap) - - # set signal handler - core._set_process_signal_handler() - - global _worker_info - _worker_info = WorkerInfo( - id=worker_id, num_workers=num_workers, dataset=dataset) - - init_exception = None - try: - if init_fn is not None: - init_fn(worker_id) - fetcher = _DatasetKind.create_fetcher(dataset_kind, dataset, - collate_fn, True) - except: - init_exception = Exception("init_fn failed in worker {}: " \ - "{}".format(worker_id, sys.exc_info())) - - iterator_drained = False - parent_watch_dog = ParentWatchDog() - - while parent_watch_dog.is_alive(): - try: - data = indices_queue.get(MP_INDICES_CHECK_INTERVAL) - except queue.Empty: - continue - - # None as poison piil, so worker event should be set - if data is None: - assert done_event.is_set() or iterator_drained, \ - "get None when worker done_event set" - break - # If worker done event is set but get still get data in - # indices_queue, remaining data should be get and skipped. - if done_event.is_set() or iterator_drained: - continue - - idx, indices = data - try: - if init_exception is not None: - batch = init_exception - init_exception = None - else: - batch = fetcher.fetch(indices) - except Exception as e: - if isinstance( - e, - StopIteration) and dataset_kind == _DatasetKind.ITER: - out_queue.put(_IterableDatasetStopIteration(worker_id)) - iterator_drained = True - else: - out_queue.put((idx, e)) - else: - if self._use_shared_memory: - # FIXME(dkp): _convert_to_tensor_list only support np.array - # list now, should support paddle.Tensor list - if isinstance(batch[0][0], paddle.Tensor): - np_batch = [] - for sample in batch: - np_batch.append([s.numpy() for s in sample]) - batch = np_batch - - tensor_list = core._convert_to_tensor_list(batch) - out_queue.put((idx, tensor_list)) - core._remove_tensor_list_mmap_fds(tensor_list) - else: - out_queue.put((idx, batch)) - except KeyboardInterrupt: - # NOTE: Main process will raise KeyboardInterrupt anyways, ignore it in child process - pass - except: - six.reraise(*sys.exc_info()) - finally: - if self._use_shared_memory: - _cleanup_mmap() - def _thread_loop(self): while not self._thread_done_event.is_set(): batch = self._get_data() diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py index 8f3ca9ec007ef5c1ab8769dde741a5d2b3697600..ff57f30dcd2ec73d55ff06e751767deea0a2eead 100644 --- a/python/paddle/fluid/dygraph/amp/loss_scaler.py +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -210,13 +210,12 @@ class AmpScaler(object): def _unscale(self, optimizer): if not self._enable: return - inv_scale = 1.0 / self._scale param_grads = [ param._grad_ivar() for param in optimizer._parameter_list if param._grad_ivar() is not None ] - core.ops.amp_check_finite_and_scale(param_grads, inv_scale, param_grads, - self._found_inf) + core.ops.check_finite_and_unscale(param_grads, self._scale, param_grads, + self._found_inf) def _update(self): """ diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index c548bdfeba19510b26c0f80d356fa6a6b7bbaed7..2f95c2b9007a53483fda86dda8d77e9baff0d8d2 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -25,6 +25,7 @@ from .tracer import Tracer import logging import objgraph from ..data_feeder import convert_dtype +import warnings __all__ = [ 'no_grad', 'no_grad_', 'grad', 'guard', 'enable_dygraph', 'disable_dygraph', @@ 
-609,10 +610,10 @@ def to_variable(value, name=None, zero_copy=None, dtype=None): uint8, uint16, complex64, complex128}. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . + refer to :ref:`api_guide_Name` . zero_copy(bool, optional): Whether to share memory with the input numpy array. This parameter only works with CPUPlace and will be set to - True when it is None. Default: None. + True when it is None. Default: None. (Note: zero_copy is temporarily disabled; see the TODO in the implementation for the reasons.) dtype(str, optional): The desired data type of returned ``Variable`` . Can be 'bool' , 'float16' , 'float32' , 'float64' , 'int8' , 'int16' , 'int32' , 'int64' , 'uint8' . Default: None. @@ -665,8 +666,17 @@ def to_variable(value, name=None, zero_copy=None, dtype=None): else: if isinstance(framework._current_expected_place(), framework.core.CPUPlace): - if zero_copy is None: - zero_copy = True + #TODO(zhiqiu): we found two problems when enabling zero_copy on CPUPlace. + # (1): eigen requires 16-byte alignment, but the data of a numpy array may not satisfy it. + # Details: https://eigen.tuxfamily.org/dox/group__TopicUnalignedArrayAssert.html + # (2): when used in the flask framework, it may result in a hang. + # Details: https://github.com/PaddlePaddle/Paddle/issues/26635 + # So, we temporarily disable the zero_copy strategy. + if zero_copy == True: + warnings.warn( + "Currently, zero_copy is not supported, and it will be discarded." + ) + zero_copy = False else: assert not zero_copy, "zero_copy mode can only be used with CPUPlace" diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py index 30ded1f7eda295bab5567a082ba1fa3989b55fa2..9876fc620b870f47b10e9f99e4de34f5cb81fde1 100644 --- a/python/paddle/fluid/dygraph/checkpoint.py +++ b/python/paddle/fluid/dygraph/checkpoint.py @@ -25,7 +25,7 @@ import warnings from .. import core from .base import guard from paddle.fluid.dygraph.jit import SaveLoadConfig, deprecate_save_load_configs -from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers +from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers, EXTRA_VAR_INFO_FILENAME __all__ = [ 'save_dygraph', @@ -233,6 +233,19 @@ def load_dygraph(model_path, config=None): para_dict = dict() for var_name in persistable_var_dict: para_dict[var_name] = persistable_var_dict[var_name].numpy() + + # if __variables.info__ exists, we can recover structured_name + var_info_path = os.path.join(model_prefix, EXTRA_VAR_INFO_FILENAME) + if os.path.exists(var_info_path): + with open(var_info_path, 'rb') as f: + extra_var_info = pickle.load(f) + structured_para_dict = dict() + for var_name in para_dict: + structured_name = extra_var_info[var_name].get( + 'structured_name', None) + assert structured_name is not None, "Cannot find saved variable (%s)'s structured name in saved model."
% var_name + structured_para_dict[structured_name] = para_dict[var_name] + para_dict = structured_para_dict else: # Load state dict by `save_dygraph` save format para_dict = {} diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/error.py b/python/paddle/fluid/dygraph/dygraph_to_static/error.py index 5aba7ca0fdc0cfda5d79f5a66d78785df49c0baf..be21ab6d5394ed5f89c23988a9405b57e05b56fb 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/error.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/error.py @@ -13,6 +13,7 @@ # limitations under the License. import os +import six import sys import traceback @@ -20,6 +21,14 @@ from paddle.fluid.dygraph.dygraph_to_static.origin_info import Location, OriginI ERROR_DATA = "Error data about original source code information and traceback." +# A flag to set whether to display the simplified error stack +SIMPLIFY_ERROR_ENV_NAME = "TRANSLATOR_SIMPLIFY_NEW_ERROR" +DEFAULT_SIMPLIFY_NEW_ERROR = 1 + +# A flag to set whether to disable the dygraph2static error reporting module +DISABLE_ERROR_ENV_NAME = "TRANSLATOR_DISABLE_NEW_ERROR" +DEFAULT_DISABLE_NEW_ERROR = 0 + def attach_error_data(error, in_runtime=False): """ @@ -103,7 +112,10 @@ class ErrorData(object): # Simplify error value to improve readability if error is raised in runtime if self.in_runtime: - self._simplify_error_value() + if int( + os.getenv(SIMPLIFY_ERROR_ENV_NAME, + DEFAULT_SIMPLIFY_NEW_ERROR)): + self._simplify_error_value() message_lines.append(str(self.error_value)) return '\n'.join(message_lines) @@ -150,3 +162,22 @@ class ErrorData(object): error_value_str = '\n'.join(error_value_lines) self.error_value = self.error_type(error_value_str) + + def raise_new_exception(self): + + # Raise the original error if the dygraph2static error module is disabled. + if int(os.getenv(DISABLE_ERROR_ENV_NAME, DEFAULT_DISABLE_NEW_ERROR)): + raise + + new_exception = self.create_exception() + if six.PY3: + # NOTE(liym27): + # 1. Why `raise new_exception from None`? + # In Python 3, by default, a new exception is raised with trace information of the caught exception. + # This only raises new_exception and hides unwanted implementation details from tracebacks of the + # caught exception. + # 2. Use exec to bypass syntax error checking in Python 2. + + six.exec_("raise new_exception from None") + else: + raise new_exception diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py index 13f38b0726c27566ff0eda41d6c365e6a7e4aa4b..76e732d4d37f6a2056afba72649077acf16ba30e 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py @@ -124,8 +124,13 @@ class OriginInfoAttacher(gast.NodeTransformer): def _abs_lineno(self, node): # NOTE(liym27): - # If the first gast.FunctionDef has decorator, its lineno is 1, which - # equals to the lineno of the first decorator node. + # There are differences in ast_node.lineno between PY3.8+ and PY3.8-. + # If the first gast.FunctionDef has a decorator, the lineno of gast.FunctionDef differs: + # 1. < PY3.8 + # its lineno equals the lineno of the first decorator node, which is not right. + # 2. >= PY3.8 + # its lineno is the actual lineno, which is right.
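        # A minimal repro of the version difference described above,
        # using the stdlib ast module (whose lineno gast mirrors):
        #
        #   import ast
        #   fn = ast.parse("@deco\ndef f():\n    pass\n").body[0]
        #   # Python < 3.8 : fn.lineno == 1  (the decorator line)
        #   # Python >= 3.8: fn.lineno == 2  (the `def` line)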
+ return self.lineno_offset + node.lineno def _abs_col_offset(self, node): diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index 3d27810f1db94c4f6c273399ec93b9335f5bb03a..e5fce3e6ede1511458f8da916165738d9e842d1a 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -32,8 +32,7 @@ from paddle.fluid.layers.utils import flatten from paddle.fluid.dygraph.base import param_guard from paddle.fluid.dygraph.base import switch_to_static_graph from paddle.fluid.dygraph.dygraph_to_static import DygraphToStaticAst -from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA -from paddle.fluid.dygraph.dygraph_to_static.error import attach_error_data +from paddle.fluid.dygraph.dygraph_to_static import error from paddle.fluid.dygraph.dygraph_to_static import logging_utils from paddle.fluid.dygraph.dygraph_to_static.origin_info import attach_origin_info from paddle.fluid.dygraph.dygraph_to_static.origin_info import create_and_update_origin_info_map @@ -315,6 +314,7 @@ class StaticLayer(object): # 2. trace ops from dygraph layers and cache the generated program. args, kwargs = self._function_spec.unified_args_and_kwargs(args, kwargs) + try: concrete_program, partial_program_layer = self.get_concrete_program( *args, **kwargs) @@ -324,27 +324,22 @@ class StaticLayer(object): partial_program_layer.training = self._class_instance.training # 4. return outputs. - return partial_program_layer(args) + try: + return partial_program_layer(args) + except Exception as e: + if not hasattr(e, error.ERROR_DATA): + # runtime error + error.attach_error_data(e, in_runtime=True) + raise except Exception as e: - if not hasattr(e, ERROR_DATA): - # runtime error - attach_error_data(e, in_runtime=True) - error_data = getattr(e, ERROR_DATA, None) + error_data = getattr(e, error.ERROR_DATA, None) if error_data: - new_exception = error_data.create_exception() - if six.PY3: - # NOTE(liym27): - # 1. Why `raise new_exception from None`? - # In Python 3, by default, an new exception is raised with trace information of the caught exception. - # This only raises new_exception and hides unwanted implementation details from tracebacks of the - # caught exception. - # 2. Use exec to bypass syntax error checking in Python 2. - - six.exec_("raise new_exception from None") - else: - raise new_exception + error_data.raise_new_exception() else: - raise + logging_utils.warn( + "Please file an issue at 'https://github.com/PaddlePaddle/Paddle/issues'" + " if you can't handle this {} yourself.".format(type(e))) + raise e def _call_dygraph_function(self, *args, **kwargs): """ @@ -593,7 +588,7 @@ class ConcreteProgram(object): outputs = static_func(*inputs) except BaseException as e: # NOTE: If e is raised in compile time, e should be attached to ERROR_DATA here. - attach_error_data(e) + error.attach_error_data(e) raise if not isinstance(outputs, @@ -813,28 +808,36 @@ class ProgramTranslator(object): "The ProgramTranslator.get_output doesn't work when setting ProgramTranslator.enable = False. 
" "We will just return dygraph output.") return dygraph_func(*args, **kwargs) - - function_spec = FunctionSpec(dygraph_func) - cache_key = CacheKey.from_func_and_args(function_spec, args, kwargs, - getattr(dygraph_func, - '__self__', None)) - _, partial_program_layer = self._program_cache[cache_key] - - if args and isinstance(args[0], layers.Layer): - # Synchronize self.training attribute. - partial_program_layer.training = args[0].training - args = args[1:] try: - return partial_program_layer(args) - + function_spec = FunctionSpec(dygraph_func) + cache_key = CacheKey.from_func_and_args(function_spec, args, kwargs, + getattr(dygraph_func, + '__self__', None)) + _, partial_program_layer = self._program_cache[cache_key] + + if args and isinstance(args[0], layers.Layer): + # Synchronize self.training attribute. + partial_program_layer.training = args[0].training + args = args[1:] + try: + return partial_program_layer(args) + except BaseException as e: + # NOTE: + # 1. If e is raised in compile time, e should have been attached to ERROR_DATA before; + # 2. If e raised in runtime, e should be attached to ERROR_DATA here. + if not hasattr(e, error.ERROR_DATA): + # runtime error + error.attach_error_data(e, in_runtime=True) + raise except BaseException as e: - # NOTE: - # 1. If e is raised in compile time, e should have been attached to ERROR_DATA before; - # 2. If e raised in runtime, e should be attached to ERROR_DATA here. - if not hasattr(e, ERROR_DATA): - # runtime error - attach_error_data(e, in_runtime=True) - raise + error_data = getattr(e, error.ERROR_DATA, None) + if error_data: + error_data.raise_new_exception() + else: + logging_utils.warn( + "Please file an issue at 'https://github.com/PaddlePaddle/Paddle/issues'" + " if you can't handle this {} yourself.".format(type(e))) + raise e def get_func(self, dygraph_func): """ diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py index 1d2ea142c7d5f2e653e446986a39d1bc155006f0..335ac500c898085e4bf60aabdf8db95fa65db31f 100644 --- a/python/paddle/fluid/dygraph/io.py +++ b/python/paddle/fluid/dygraph/io.py @@ -479,11 +479,15 @@ def _load_persistable_vars(model_path, var_file_path = os.path.join(model_path, params_filename) else: var_file_path = os.path.join(model_path, VARIABLE_FILENAME) - framework._dygraph_tracer().trace_op( - type='load_combine', - inputs={}, - outputs={'Out': load_var_list}, - attrs={'file_path': var_file_path}) + if not os.path.exists(var_file_path): + if len(extra_var_info) != 0: + raise ValueError("The model to be loaded is incomplete.") + else: + framework._dygraph_tracer().trace_op( + type='load_combine', + inputs={}, + outputs={'Out': load_var_list}, + attrs={'file_path': var_file_path}) return load_var_dict diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index ad2088116243e3a0a75fb43b9bb34b19456c84de..57864efec8a9447cca0be94f0f1b433c18435376 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -18,6 +18,7 @@ import os import pickle import warnings import functools +from collections import OrderedDict import six import paddle @@ -211,7 +212,16 @@ def declarative(function=None, input_spec=None): # for usage: `declarative(foo, ...)` if function is not None: - return decorated(function) + if isinstance(function, Layer): + if isinstance(function.forward, StaticLayer): + class_name = function.__class__.__name__ + warnings.warn( + "`{}.forward` has already been decorated somewhere. 
@@ -633,6 +643,73 @@ class SaveLoadConfig(object): self._keep_name_table = value +def _get_input_var_names(inputs, input_spec): + name_none_error = "The %s's name is None. " \ + "When using jit.save, please set InputSpec's name in " \ + "to_static(input_spec=[]) and jit.save(input_spec=[]) " \ + "and make sure they are consistent." + name_no_exists_error = "The tensor `%s` does not exist. " \ + "Please make sure the name of InputSpec or example Tensor " \ + "in input_spec is the same as the name of InputSpec in " \ + "`to_static` decorated on the Layer.forward method." + result_list = [] + input_var_names = [var.name for var in inputs if isinstance(var, Variable)] + if input_spec is None: + # no prune + result_list = input_var_names + elif input_spec is not None and len(input_spec) == len(input_var_names): + # no prune + result_list = input_var_names + # if input spec name not in input_var_names, only raise warning + for spec in input_spec: + if spec.name is None: + warnings.warn(name_none_error % spec) + elif spec.name not in input_var_names: + warnings.warn(name_no_exists_error % spec.name) + else: + # do nothing + pass + else: + # prune + for spec in input_spec: + if spec.name is None: + # name is None, the input_spec can only be InputSpec + raise ValueError(name_none_error % spec) + elif spec.name not in input_var_names: + # the input_spec can be `InputSpec` or `VarBase` + raise ValueError(name_no_exists_error % spec.name) + else: + result_list.append(spec.name) + + return result_list + + +def _get_output_vars(outputs, output_spec): + name_no_exists_error = "The tensor `%s` does not exist. " \ + "Please make sure the name of example Tensor " \ + "in configs.output_spec is the output tensor of " \ + "Layer.forward method." + result_list = [] + output_vars_dict = OrderedDict() + for var in outputs: + if isinstance(var, Variable): + output_vars_dict[var.name] = var + if output_spec is None: + result_list = output_vars_dict.values() + elif output_spec is not None and len(output_spec) == len(output_vars_dict): + result_list = output_vars_dict.values() + for var in output_spec: + if var.name not in output_vars_dict: + warnings.warn(name_no_exists_error % var.name) + else: + for var in output_spec: + if var.name not in output_vars_dict: + raise ValueError(name_no_exists_error % var.name) + else: + result_list.append(output_vars_dict[var.name]) + return result_list + + # NOTE(chenweihang): change jit.save/load argument `configs` to `config` def deprecate_save_load_configs(func): @functools.wraps(func)
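The two helpers above encode three matching rules between `input_spec` and the program inputs: no spec keeps every input, a spec of equal length keeps every input (warning on unknown names), and a shorter spec prunes the inputs while insisting on exact, non-None names. A hedged illustration of the prune case (the names and shapes are made up for the example):

import paddle
from paddle.static import InputSpec

# Suppose layer.forward consumes two tensors but only `image` should stay
# an inference input. A single named InputSpec takes the prune branch of
# _get_input_var_names, so its name must match a real program input.
image_spec = InputSpec(shape=[None, 784], dtype='float32', name='image')
# paddle.jit.save(layer, 'infer_model', input_spec=[image_spec])  # layer assumed
# In the prune branch, InputSpec(name=None) raises ValueError, and a name
# absent from the program inputs raises ValueError as well.
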
@@ -753,26 +830,6 @@ def save(layer, model_path, input_spec=None, config=None): paddle.jit.save(layer, model_path) """ - def get_inout_spec(all_vars, target_vars, return_name=False): - result_list = [] - valid_var_dict = {} - valid_vars = [var for var in all_vars if isinstance(var, Variable)] - for var in valid_vars: - valid_var_dict[var.name] = var - if target_vars: - for i, var in enumerate(target_vars): - # check target var whether exists - if var.name not in valid_var_dict: - raise RuntimeError( - "The variable to feed/fetch are not exist.") - result_list.append(valid_var_dict[var.name]) - else: - result_list = valid_vars - if return_name: - result_list = [var.name for var in result_list] - - return result_list - - # 1. input check prog_translator = ProgramTranslator() if not prog_translator.enable: @@ -788,25 +845,58 @@ def save(layer, model_path, input_spec=None, config=None): if configs is None: configs = SaveLoadConfig() + # avoid changing the user-given input_spec + inner_input_spec = None if input_spec is not None: if not isinstance(input_spec, list): raise TypeError( "The input input_spec should be 'list', but received input_spec's type is %s." % type(input_spec)) + inner_input_spec = [] for var in input_spec: - if not isinstance(var, (core.VarBase, Variable, - paddle.static.InputSpec)): + if isinstance(var, paddle.static.InputSpec): + inner_input_spec.append(var) + elif isinstance(var, (core.VarBase, Variable)): + inner_input_spec.append( + paddle.static.InputSpec.from_tensor(var)) + else: raise TypeError( "The element in input_spec list should be 'Variable' or `paddle.static.InputSpec`, but received element's type is %s." % type(var)) - # 2. get program of declarative Layer.forward - if not isinstance(layer.forward, StaticLayer): - raise RuntimeError( - "layer.forward need to be decorated by `@declarative`.") - concrete_program = layer.forward.concrete_program - - # NOTE: we maintain the mapping of variable name to + # 2. get program from Layer + # TODO(chenweihang): add support for other method, not only forward + if isinstance(layer.forward, StaticLayer): + concrete_program = layer.forward.concrete_program + else: + # transform in jit.save; if input_spec is incomplete, declarative will throw an error + static_forward = declarative(layer.forward, input_spec=inner_input_spec) + concrete_program = static_forward.concrete_program + # the input_spec has been used in declarative, which is equal to + # @declarative with input_spec and jit.save without input_spec, + # avoid needless warning + inner_input_spec = None + + # 3. build input & output of save_inference_model + # NOTE(chenweihang): [ Get input variables name ] + # There are two cases, whether to prune the inputs or not + # - not prune inputs (recommended): + # - len(input_spec) == len(concrete_program.inputs) - 1 + # - here concrete_program.inputs can be used directly + # - prune inputs: + # - the input_spec length < len(concrete_program.inputs) - 1 + # - the input_spec's name should be in concrete_program.inputs + input_var_names = _get_input_var_names(concrete_program.inputs, + inner_input_spec) + + # NOTE(chenweihang): [ Get output variables ] + # the rule is like [ Get input variables name ]. For output var, + # we only support VarBase spec, and actually, we only need the + # var name of output, and we don't recommend using output_spec + output_vars = _get_output_vars(concrete_program.outputs, + configs.output_spec) + + # NOTE(chenweihang): we maintain the mapping of variable name to # structured name, the buffer variable (non-persistable) # saved to inference program may not need by dygraph Layer, # we only record the state_dict variable's structured name @@ -814,7 +904,7 @@ def save(layer, model_path, input_spec=None, config=None): for structured_name, var in six.iteritems(layer.state_dict()): state_names_dict[var.name] = structured_name - # 3. share parameters from Layer to scope & record var info + # 4.
share parameters from Layer to scope & record var info scope = core.Scope() extra_var_info = dict() for param_or_buffer in concrete_program.parameters: @@ -832,10 +922,6 @@ def save(layer, model_path, input_spec=None, config=None): extra_info_dict['trainable'] = param_or_buffer.trainable extra_var_info[param_or_buffer.name] = extra_info_dict - # 4. build input & output spec - input_var_names = get_inout_spec(concrete_program.inputs, input_spec, True) - output_vars = get_inout_spec(concrete_program.outputs, configs.output_spec) - # 5. save inference model from paddle.fluid.io import save_inference_model @@ -856,7 +942,7 @@ def save(layer, model_path, input_spec=None, config=None): export_for_deployment=configs._export_for_deployment, program_only=configs._program_only) - # NOTE: [ Save extra variable info ] + # NOTE(chenweihang): [ Save extra variable info ] # save_inference_model will lose some important variable information, including: # - Variable name and correspondence (when saved variables as one file) # - Variable.stop_gradient information diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py index 8c4109674200bf97354444f92f00b13e053152a0..f9fe4198fec3a0a2237c0bcac6e20f4269160589 100644 --- a/python/paddle/fluid/dygraph/math_op_patch.py +++ b/python/paddle/fluid/dygraph/math_op_patch.py @@ -19,7 +19,6 @@ from ..framework import Variable, convert_np_dtype_to_dtype_, _varbase_creator from ..layers.layer_function_generator import OpProtoHolder from ..layers import common_methods from . import to_variable, no_grad -import paddle import numpy as np import six @@ -163,26 +162,6 @@ def monkey_patch_math_varbase(): def _scalar_div_(var, value): return _scalar_elementwise_op_(var, 1.0 / value, 0.0) - # TODO(shenliang03): currently, it supports divide, floor_divide, remainder - # for binary operator by using the api to achieve the type promotion - def _binary_method_creator_(op_type, reverse=False): - import paddle - - def __impl__(self, other_var): - import paddle - op = getattr(paddle, op_type) - if reverse: - return op(other_var, self) - else: - return op(self, other_var) - - __impl__.__doc__ = """ - - See paddle.{}""".format(op_type) - __impl__.__name__ = op_type - - return __impl__ - # for binary operator such as elementwise, compare def _binary_creator_(method_name, op_type, @@ -281,20 +260,22 @@ def monkey_patch_math_varbase(): ## a*b == b*a. 
Do not need to reverse explicitly ('__rmul__', _binary_creator_('__rmul__', 'elementwise_mul', False, _scalar_mul_)), + ('__div__', _binary_creator_('__div__', 'elementwise_div', False, + _scalar_div_)), + ('__truediv__', _binary_creator_('__truediv__', 'elementwise_div', + False, _scalar_div_)), + ('__rdiv__', _binary_creator_('__rdiv__', 'elementwise_div', True, + None)), ('__rtruediv__', _binary_creator_('rtruediv__', 'elementwise_div', True, None)), ('__pow__', _binary_creator_('__pow__', 'elementwise_pow', False, None)), ('__rpow__', _binary_creator_('__rpow__', 'elementwise_pow', True, None)), - # These binary use paddle.optype - ('__div__', _binary_method_creator_('divide', False)), - ('__truediv__', _binary_method_creator_('divide', False)), - ('__rtruediv__', _binary_method_creator_('divide', True)), - ('__rdiv__', _binary_method_creator_('divide', True)), - ('__floordiv__', _binary_method_creator_('floor_divide', False)), - ('__rfloordiv__', _binary_method_creator_('floor_divide', True)), - ('__mod__', _binary_method_creator_('remainder', False)), + ('__floordiv__', _binary_creator_('__floordiv__', + 'elementwise_floordiv', False, None)), + ('__mod__', _binary_creator_('__mod__', 'elementwise_mod', False, + None)), ## for logical compare ('__eq__', _binary_creator_('__eq__', 'equal', False, None)), ('__ne__', _binary_creator_('__ne__', 'not_equal', False, None)), diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py index 38fc34472c8bc64338e2468bdf3f4b0bab1370ce..4595f0cf93916d71a3d0ec582af1917500d68f12 100644 --- a/python/paddle/fluid/layers/math_op_patch.py +++ b/python/paddle/fluid/layers/math_op_patch.py @@ -16,7 +16,6 @@ from __future__ import print_function import warnings import inspect -import paddle from .. import core from ..framework import Variable, unique_name @@ -46,7 +45,6 @@ EXPRESSION_MAP = { "__pow__": "A ** B", "__rpow__": "A **= B", "__floordiv__": "A //B", - "__rfloordiv__": "A //= B", "__mod__": "A % B", "__eq__": "A == B", "__ne__": "A != B", @@ -235,25 +233,6 @@ def monkey_patch_variable(): def _scalar_div_(var, value): return _scalar_op_(var, 1.0 / value, 0.0) - # TODO(shenliang03): currently, it supports divide, floor_divide, remainder - # for binary operator by using the api to achieve the type promotion - def _binary_method_creator_(op_type, reverse=False): - import paddle - - def __impl__(self, other_var): - op = getattr(paddle, op_type) - if reverse: - return op(other_var, self) - else: - return op(self, other_var) - - __impl__.__doc__ = """ - - See paddle.{}""".format(op_type) - __impl__.__name__ = op_type - - return __impl__ - def _binary_creator_(method_name, op_type, reverse=False, @@ -360,18 +339,22 @@ def monkey_patch_variable(): # a*b == b*a. 
Do not need to reverse explicitly ('__rmul__', _binary_creator_('__rmul__', 'elementwise_mul', False, _scalar_mul_)), + ('__div__', _binary_creator_('__div__', 'elementwise_div', False, + _scalar_div_)), + ('__truediv__', _binary_creator_('__truediv__', 'elementwise_div', + False, _scalar_div_)), + ('__rdiv__', _binary_creator_('__rdiv__', 'elementwise_div', True, + None)), + ('__rtruediv__', _binary_creator_('__rtruediv__', 'elementwise_div', + True, None)), ('__pow__', _binary_creator_('__pow__', 'elementwise_pow', False, None)), ('__rpow__', _binary_creator_('__rpow__', 'elementwise_pow', True, None)), - # These binary use paddle.optype - ('__div__', _binary_method_creator_('divide', False)), - ('__rdiv__', _binary_method_creator_('divide', True)), - ('__truediv__', _binary_method_creator_('divide', False)), - ('__rtruediv__', _binary_method_creator_('divide', True)), - ('__floordiv__', _binary_method_creator_('floor_divide', False)), - ('__rfloordiv__', _binary_method_creator_('floor_divide', True)), - ('__mod__', _binary_method_creator_('remainder', False)), + ('__floordiv__', _binary_creator_('__floordiv__', + 'elementwise_floordiv', False, None)), + ('__mod__', _binary_creator_('__mod__', 'elementwise_mod', False, + None)), # for logical compare ('__eq__', _binary_creator_('__eq__', 'equal', False, None)), ('__ne__', _binary_creator_('__ne__', 'not_equal', False, None)), diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 5a14b9fdc7b6d963f77eefc29476bc332f3938df..bc9f182d95e3b728fbc0866e1c79f5508d3a04aa 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6309,7 +6309,7 @@ def unsqueeze(input, axes, name=None): if isinstance(axes, int): axes = [axes] elif isinstance(axes, Variable): - axes = [axes.numpy().item(0)] + axes = axes.numpy().tolist() elif isinstance(axes, (list, tuple)): axes = [ item.numpy().item(0) if isinstance(item, Variable) else item diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py old mode 100644 new mode 100755 index 8b37cfef3890eace0ff5141eeb91d85e78f1c964..192effd2e42dc937fbf47efdd1d772a4c078f888 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -1604,7 +1604,7 @@ class LarsMomentumOptimizer(Optimizer): & local\_learning\_rate = learning\_rate * lars\_coeff * \\ \\frac{||param||}{||gradient|| + lars\_weight\_decay * ||param||} - & velocity = mu * velocity + local\_learning\_rate * (gradient + lars\_weight\_decay * param) + & velocity = mu * velocity + local\_learning\_rate * (gradient + lars\_weight\_decay * param + epsilon) & param = param - velocity @@ -1628,7 +1628,9 @@ class LarsMomentumOptimizer(Optimizer): :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. name (str, optional): This parameter is used by developers to print debugging information. \ For details, please refer to :ref:`api_guide_Name`. Default is None. - + exclude_from_weight_decay (list[str], optional): Name string of layers which will be excluded from lars weight decay. Default is None. + epsilon (float, optional): Epsilon to avoid division by zero when calculating the local learning rate. Default is 0. + Examples: ..
code-block:: python @@ -1659,7 +1661,9 @@ class LarsMomentumOptimizer(Optimizer): parameter_list=None, regularization=None, grad_clip=None, - name=None): + name=None, + exclude_from_weight_decay=None, + epsilon=0): assert learning_rate is not None assert momentum is not None super(LarsMomentumOptimizer, self).__init__( @@ -1672,6 +1676,11 @@ class LarsMomentumOptimizer(Optimizer): self._momentum = momentum self._lars_coeff = float(lars_coeff) self._lars_weight_decay = float(lars_weight_decay) + self._epsilon = float(epsilon) + if exclude_from_weight_decay is None: + self._exclude_from_weight_decay = [] + else: + self._exclude_from_weight_decay = exclude_from_weight_decay def _create_accumulators(self, block, parameters): assert isinstance(block, framework.Block) @@ -1682,6 +1691,14 @@ class LarsMomentumOptimizer(Optimizer): def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) + _lars_weight_decay = self._lars_weight_decay + param_name = param_and_grad[0].name + if len(self._exclude_from_weight_decay) > 0: + for name in self._exclude_from_weight_decay: + if name in param_name: + _lars_weight_decay = 0.0 + break + velocity_acc = self._get_accumulator(self._velocity_acc_str, param_and_grad[0]) # create the momentum optimize op @@ -1700,7 +1717,8 @@ class LarsMomentumOptimizer(Optimizer): attrs={ "mu": self._momentum, "lars_coeff": self._lars_coeff, - "lars_weight_decay": self._lars_weight_decay + "lars_weight_decay": _lars_weight_decay, + "epsilon": self._epsilon }, stop_gradient=True)
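Per the new `epsilon` parameter above, the local-learning-rate division can no longer collapse to 0/0 when both the gradient norm and the decayed parameter norm vanish. A quick numeric sketch of the docstring formula, under the assumption that epsilon enters the denominator (all values are made up):

import numpy as np

learning_rate, lars_coeff, lars_weight_decay, epsilon = 0.1, 0.001, 0.0005, 1e-8
param = np.zeros(10)   # pathological case: all-zero parameter ...
grad = np.zeros(10)    # ... and all-zero gradient
p_norm, g_norm = np.linalg.norm(param), np.linalg.norm(grad)

# without epsilon this is 0/0 and yields nan; with it, a finite 0.0
local_lr = learning_rate * lars_coeff * p_norm / (
    g_norm + lars_weight_decay * p_norm + epsilon)
print(local_lr)
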
diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index 76c95be75d67d60cd59efe13ecba6f01a1c1d614..f2bb567b95b01eaf9a820359acef74e1c360c7f2 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -85,6 +85,30 @@ def _convert_places(places): return ret + +# NOTE(chenweihang): _reader_process_loop must be a top-level method so that it can be pickled +def _reader_process_loop(batch_reader, data_queue): + try: + # set signal handler + core._set_process_signal_handler() + + # NOTE: [ mmap files clear ] When the child process exits unexpectedly, + # some shared memory objects may have been applied for but have not yet + # been put into the inter-process Queue. These objects need + # to be cleaned up when the process ends. + CleanupFuncRegistrar.register(_cleanup_mmap) + + for batch in batch_reader(): + tensor_list = core._convert_to_tensor_list(batch) + data_queue.put(tensor_list) + core._remove_tensor_list_mmap_fds(tensor_list) + data_queue.put(None) + except KeyboardInterrupt: + # NOTE: Main process will raise KeyboardInterrupt anyway, ignore it in child process + pass + except: + six.reraise(*sys.exc_info()) + + class DataLoaderBase(object): def __init__(self): self._places = None @@ -811,7 +835,8 @@ class DygraphGeneratorLoader(DataLoaderBase): global multiprocess_queue_set multiprocess_queue_set.add(self._data_queue) self._process = multiprocessing.Process( - target=self._reader_process_loop) + target=_reader_process_loop, + args=(self._batch_reader, self._data_queue)) self._process.daemon = True self._process.start() @@ -867,28 +892,6 @@ class DygraphGeneratorLoader(DataLoaderBase): self._blocking_queue.kill() logging.error("DataLoader reader thread raised an exception!") - def _reader_process_loop(self): - try: - # set signal handler - core._set_process_signal_handler() - - # NOTE: [ mmap files clear ] When the child process exits unexpectedly, - # some shared memory objects may have been applied for but have not yet - # been put into the inter-process Queue. This part of the object needs - # to be cleaned up when the process ends. - CleanupFuncRegistrar.register(_cleanup_mmap) - - for batch in self._batch_reader(): - tensor_list = core._convert_to_tensor_list(batch) - self._data_queue.put(tensor_list) - core._remove_tensor_list_mmap_fds(tensor_list) - self._data_queue.put(None) - except KeyboardInterrupt: - # NOTE: Main process will raise KeyboardInterrupt anyways, ignore it in child process - pass - except: - six.reraise(*sys.exc_info()) - def _reader_thread_loop_for_multiprocess(self): while not self._thread_done_event.is_set(): try: diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index db472ec01662c2a93e6d7f4b2c2a577544b6a90a..102bacff9639d9f0076ac4900a89a58bdf508494 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -47,6 +47,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_dgc_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_private_function) list(APPEND MIXED_DIST_TEST_OPS test_fleet_graph_executor) list(APPEND MIXED_DIST_TEST_OPS test_fleet_meta_optimizer_base) +list(APPEND MIXED_DIST_TEST_OPS test_fleet_distributed_strategy) list(APPEND MIXED_DIST_TEST_OPS test_fleet_auto) foreach(TEST_OP ${MIXED_DIST_TEST_OPS}) list(REMOVE_ITEM TEST_OPS ${TEST_OP}) @@ -440,8 +441,6 @@ if(WITH_DISTRIBUTE) # FIXME(seiriosX) will fix this list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_sparse_embedding_ctr") list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_gloo") - list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_a_sync_optimizer_auto") - list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_ctr") py_test_modules(test_recv_save_op MODULES test_recv_save_op ENVS ${dist_ENVS}) py_test_modules(test_transpiler_ops MODULES test_transpiler_ops ENVS ${dist_ENVS}) @@ -461,6 +460,7 @@ if(WITH_DISTRIBUTE) py_test_modules(test_fleet_pipeline_meta_optimizer MODULES test_fleet_pipeline_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS}) py_test_modules(test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS ${dist_ENVS}) + py_test_modules(test_fleet_distributed_strategy MODULES
test_fleet_distributed_strategy) py_test_modules(test_fleet_auto MODULES test_fleet_auto ENVS ${dist_ENVS}) if(NOT WIN32) py_test_modules(test_fleet_localsgd_meta_optimizer MODULES test_fleet_localsgd_meta_optimizer ENVS ${dist_ENVS}) @@ -559,8 +559,8 @@ endif() set_tests_properties(test_parallel_executor_test_while_train test_parallel_executor_mnist test_parallel_executor_feed_persistable_var test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass - test_data_norm_op test_imperative_using_non_zero_gpu test_fuse_bn_act_pass - test_optimizer_in_control_flow test_dataloader_keep_order + test_data_norm_op test_imperative_using_non_zero_gpu + test_dataloader_keep_order test_dataloader_unkeep_order test_parallel_executor_fetch_isolated_var test_parallel_executor_inference_feed_partial_data diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py index eed02ea655ed406fe59d8117e4fd836dfe1bfd4e..5582a65304d3e9bad2d4621e11f8a4f312189a9a 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py @@ -332,5 +332,31 @@ class TestDeclarativeAPI(unittest.TestCase): func(np.ones(5).astype("int32")) +class TestDecorateModelDirectly(unittest.TestCase): + def setUp(self): + paddle.disable_static() + program_trans.enable(True) + self.x = to_variable(np.ones([4, 10]).astype('float32')) + + def test_fake_input(self): + net = SimpleNet() + net = declarative(net) + y = net(self.x) + self.assertTrue(len(net.forward.program_cache) == 1) + + def test_input_spec(self): + net = SimpleNet() + net = declarative(net, input_spec=[InputSpec([None, 8, 10])]) + self.assertTrue(len(net.forward.inputs) == 1) + self.assertTrue(len(net.forward.program_cache) == 1) + input_shape = net.forward.inputs[0].shape + self.assertListEqual(list(input_shape), [-1, 8, 10]) + + # redecorate + net = declarative(net, input_spec=[InputSpec([None, 16, 10])]) + input_shape = net.forward.inputs[0].shape + self.assertListEqual(list(input_shape), [-1, 16, 10]) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py index 586020d434519b12c6fff4cbba812a013cf45c3d..2998ba85757e7677d5f9ab39ff81682a8b315072 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py @@ -14,15 +14,15 @@ from __future__ import print_function +import os import inspect import unittest - import numpy as np +import paddle import paddle.fluid as fluid from paddle.fluid.core import EnforceNotMet -from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA, ErrorData +from paddle.fluid.dygraph.dygraph_to_static import error from paddle.fluid.dygraph.dygraph_to_static.origin_info import unwrap -from paddle.fluid.dygraph.jit import declarative def inner_func(): @@ -30,7 +30,7 @@ def inner_func(): return -@declarative +@paddle.jit.to_static def func_error_in_compile_time(x): x = fluid.dygraph.to_variable(x) inner_func() @@ -41,14 +41,14 @@ def func_error_in_compile_time(x): return x_v -@declarative +@paddle.jit.to_static def func_error_in_compile_time_2(x): x = fluid.dygraph.to_variable(x) x = fluid.layers.reshape(x, shape=[1, 2]) return x -@declarative +@paddle.jit.to_static def func_error_in_runtime(x, iter_num=3): x = 
fluid.dygraph.to_variable(x) two = fluid.layers.fill_constant(shape=[1], value=2, dtype="int32") @@ -61,6 +61,9 @@ class TestErrorInCompileTime(unittest.TestCase): self.set_func() self.set_input() self.set_exception_type() + self.prog_trans = paddle.jit.ProgramTranslator() + self.simplify_error = 1 + self.disable_error = 0 def set_func(self): self.func = func_error_in_compile_time @@ -88,14 +91,38 @@ class TestErrorInCompileTime(unittest.TestCase): for m in self.expected_message: self.assertIn(m, error_message) - def test(self): - with fluid.dygraph.guard(): - with self.assertRaises(self.exception_type) as cm: - self.func(self.input) - exception = cm.exception - error_data = getattr(exception, ERROR_DATA) - self.assertIsInstance(error_data, ErrorData) - self._test_create_message(error_data) + def _test_attach_and_raise_new_exception(self, func_call): + paddle.disable_static() + with self.assertRaises(self.exception_type) as cm: + func_call() + exception = cm.exception + + error_data = getattr(exception, error.ERROR_DATA, None) + + self.assertIsInstance(error_data, error.ErrorData) + self._test_create_message(error_data) + + def test_static_layer_call(self): + # NOTE: self.func(self.input) is the StaticLayer().__call__(self.input) + call_dy2static = lambda: self.func(self.input) + + self.set_flags(0) + self._test_attach_and_raise_new_exception(call_dy2static) + + def test_program_translator_get_output(self): + call_dy2static = lambda : self.prog_trans.get_output(unwrap(self.func), self.input) + + self.set_flags(0) + self._test_attach_and_raise_new_exception(call_dy2static) + + def set_flags(self, disable_error=0, simplify_error=1): + os.environ[error.DISABLE_ERROR_ENV_NAME] = str(disable_error) + self.disable_error = int(os.getenv(error.DISABLE_ERROR_ENV_NAME, 0)) + self.assertEqual(self.disable_error, disable_error) + + os.environ[error.SIMPLIFY_ERROR_ENV_NAME] = str(simplify_error) + self.simplify_error = int(os.getenv(error.SIMPLIFY_ERROR_ENV_NAME, 1)) + self.assertEqual(self.simplify_error, simplify_error) class TestErrorInCompileTime2(TestErrorInCompileTime): @@ -143,5 +170,28 @@ class TestErrorInRuntime(TestErrorInCompileTime): self.assertIn(m, error_message) +@unwrap +@paddle.jit.to_static() +def func_decorated_by_other_1(): + return 1 + + +@paddle.jit.to_static() +@unwrap +def func_decorated_by_other_2(): + return 1 + + +class TestErrorInOther(unittest.TestCase): + def test(self): + paddle.disable_static() + prog_trans = paddle.jit.ProgramTranslator() + with self.assertRaises(NotImplementedError): + prog_trans.get_output(func_decorated_by_other_1) + + with self.assertRaises(NotImplementedError): + func_decorated_by_other_2() + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py index b03777b6ebc7f3cceb73cd32e6fdfea11755320e..3f77e9ade285e2c3d8452ea2171505442ee52fb0 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py @@ -14,6 +14,7 @@ from __future__ import print_function +import sys import unittest from paddle.fluid.dygraph.dygraph_to_static.ast_transformer import DygraphToStaticAst @@ -177,8 +178,20 @@ class TestOriginInfoWithDecoratedFunc(TestOriginInfo): def set_dygraph_info(self): self.line_num = 2 - self.line_index_list = [0, 2] - self.dy_rel_lineno_list = [0, 2] + + # NOTE(liym27): + # There are differences in 
ast_node.lineno between PY3.8+ and PY3.8-. + # If the first gast.FunctionDef has a decorator, the lineno of the gast.FunctionDef differs: + # 1. < PY3.8 + # its lineno equals the lineno of the first decorator node, which is not right. + # 2. >= PY3.8 + # its lineno is the actual lineno, which is right. + if sys.version_info >= (3, 8): + self.line_index_list = [1, 2] + self.dy_rel_lineno_list = [1, 2] + else: + self.line_index_list = [0, 2] + self.dy_rel_lineno_list = [0, 2] self.dy_abs_col_offset = [0, 4] self.dy_func_name = [self.dygraph_func.__name__] * self.line_num @@ -199,8 +212,13 @@ class TestOriginInfoWithDecoratedFunc2(TestOriginInfo): def set_dygraph_info(self): self.line_num = 2 - self.line_index_list = [0, 3] - self.dy_rel_lineno_list = [0, 3] + + if sys.version_info >= (3, 8): + self.line_index_list = [2, 3] + self.dy_rel_lineno_list = [2, 3] + else: + self.line_index_list = [0, 3] + self.dy_rel_lineno_list = [0, 3] self.dy_abs_col_offset = [0, 4] self.dy_func_name = [self.dygraph_func.__name__] * self.line_num diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py index f0fbe54f9dbbf93121655e784601467c13b3a70d..91067f360995e1661c200df923a698f3f146b71e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py @@ -14,6 +14,7 @@ from __future__ import print_function import numpy as np +import paddle import paddle.fluid as fluid from paddle.fluid.layers.utils import flatten from paddle.fluid.dygraph import declarative, ProgramTranslator @@ -151,5 +152,33 @@ class TestWithTrainAndEval(unittest.TestCase): partial_layer._train_program) + +class GPT2LMHeadModel(fluid.dygraph.Layer): + def __init__(self): + super(GPT2LMHeadModel, self).__init__() + self.embedding0 = paddle.nn.Embedding(20, 16) + self.embedding1 = paddle.nn.Embedding(20, 32) + self.lm_head_weight = paddle.to_tensor( + np.random.rand(2, 3).astype('float32')) + + @declarative + def forward(self, x): + x = fluid.layers.reshape(x, shape=[-1, 6]) + x1, x2, x3 = fluid.layers.split(input=x, dim=1, num_or_sections=3) + return x1 + + +class TestPruneUnusedParamInProgram(unittest.TestCase): + def test_prune(self): + input_ids = np.array([[15, 11, 6, 3, 18, 13]]).astype("float32") + + place = fluid.CPUPlace() + with fluid.dygraph.guard(place): + model = GPT2LMHeadModel() + model.eval() + input_ids = paddle.to_tensor(input_ids) + out = model(input_ids) + self.assertTrue(np.array_equal(out.numpy(), [[15, 11]])) + + if __name__ == '__main__': unittest.main()
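The version split that test_origin_info now encodes can be reproduced with the standard ast module: for a decorated function, the lineno reported for the FunctionDef node moved between interpreter versions. A minimal sketch:

import ast
import sys

src = "@deco\ndef f():\n    pass\n"
func_def = ast.parse(src).body[0]
# Python < 3.8 reports lineno == 1 (the first decorator's line);
# Python >= 3.8 reports lineno == 2 (the `def` line itself).
print(sys.version_info[:2], func_def.lineno)
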
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..ff4531f0e250e325f39ef69161c8d1ee751a2336 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py @@ -0,0 +1,145 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.test_fusion_gru_op import fusion_gru +from paddle.fluid.tests.unittests.test_fusion_lstm_op import fc, ACTIVATION + + +class TestFusionGRUINT8MKLDNNOp(OpTest): + def set_confs(self): + pass + + def setUp(self): + self.op_type = "fusion_gru" + self.lod = [[2, 4, 3]] + self.IC = 3 + self.OC = 5 + self.is_reverse = False + self.with_h0 = False + self.with_bias = True + self.act_state = 'tanh' + self.act_gate = 'sigmoid' + self.origin_mode = True + self.use_mkldnn = True + self.force_fp32_output = True + self.error_margin = 1e-5 + self.set_confs() + + # RNN dimensions + T = sum(self.lod[0]) + N = len(self.lod[0]) + + # Input data + x_f32 = np.random.rand(T, self.IC).astype('float32') * 2 - 1 + scale_data = 63 + shift_data = 64 + x_u8 = (x_f32 * scale_data + shift_data).astype(np.uint8) + + # WeightX/WeightH data + wx = np.random.rand(self.IC, 3 * self.OC).astype('float32') * 2 - 1 + wh = np.random.rand(self.OC, 3 * self.OC).astype('float32') * 2 - 1 + + # Calculating weight scales + # scales = 63 / max(abs(channel_wise(weightsX + weightsH))) + # WeightX data shape in PP: [IC, 3 * OC] + # WeightH data shape in PP: [OC, 2 * OC] + [OC, OC] + # Scales shape in oneDNN: [3, OC] + scale_ur = 63 / np.max(np.abs( + np.concatenate( + [ + wx[:, :2 * self.OC], wh.flatten()[:2 * self.OC * self.OC] + .reshape(self.OC, 2 * self.OC) + ], + axis=0)), + axis=0) + scale_o = 63 / np.max(np.abs( + np.concatenate( + [ + wx[:, 2 * self.OC:], wh.flatten()[2 * self.OC * self.OC:] + .reshape(self.OC, self.OC) + ], + axis=0)), + axis=0) + + scale_weights = np.concatenate([scale_ur, scale_o]).astype('float') + + bias = np.random.rand( + 1, 3 * self.OC).astype('float32') if self.with_bias else np.zeros( + (1, 3 * self.OC), dtype='float32') + h0 = np.random.rand( + N, self.OC).astype('float32') if self.with_h0 else np.zeros( + (N, self.OC), dtype='float32') + + _, _, _, hidden_f32 = fusion_gru(x_f32, self.lod, h0, wx, wh, bias, + self.is_reverse, self.origin_mode, + ACTIVATION[self.act_state], + ACTIVATION[self.act_gate]) + + self.inputs = {'X': (x_u8, self.lod), 'WeightX': wx, 'WeightH': wh} + + if self.with_bias: + self.inputs['Bias'] = bias + + if self.with_h0: + self.inputs['H0'] = h0 + + if self.force_fp32_output: + self.error_margin = 1e-1 + self.outputs = {'Hidden': (hidden_f32, self.lod)} + else: + self.error_margin = 1 + hidden_u8 = (hidden_f32 * scale_data + shift_data).astype(np.uint8) + self.outputs = {'Hidden': (hidden_u8, self.lod)} + + self.attrs = { + 'activation': self.act_state, + 'gate_activation': self.act_gate, + 'is_reverse': self.is_reverse, + 'origin_mode': self.origin_mode, + 'use_mkldnn': self.use_mkldnn, + 'force_fp32_output': self.force_fp32_output, + 'Scale_data': scale_data, + 'Shift_data': shift_data, + 'Scale_weights': scale_weights + } + + def test_check_output(self): + self.check_output(check_dygraph=False, atol=self.error_margin) + + +class TestFusionGRUINT8MKLDNNOp2(TestFusionGRUINT8MKLDNNOp): + def set_confs(self): + self.force_fp32_output = False +
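The scale computation in setUp above condenses the comment "scales = 63 / max(abs(channel_wise(weightsX + weightsH)))": the update/reset-gate columns of WeightX and WeightH are scaled together, and the output-gate columns separately. A standalone restatement of the same arithmetic (shapes follow the test; the variable names are illustrative):

import numpy as np

IC, OC = 3, 5
wx = np.random.rand(IC, 3 * OC) * 2 - 1   # [IC, 3*OC], gate order: u, r, o
wh = np.random.rand(OC, 3 * OC) * 2 - 1   # stored as [OC, 2*OC] + [OC, OC]

wh_ur = wh.flatten()[:2 * OC * OC].reshape(OC, 2 * OC)
wh_o = wh.flatten()[2 * OC * OC:].reshape(OC, OC)

# per-output-channel max over the stacked X/H weights, gate groups kept apart
scale_ur = 63 / np.max(np.abs(np.concatenate([wx[:, :2 * OC], wh_ur])), axis=0)
scale_o = 63 / np.max(np.abs(np.concatenate([wx[:, 2 * OC:], wh_o])), axis=0)
scale_weights = np.concatenate([scale_ur, scale_o])
assert scale_weights.shape == (3 * OC,)   # one scale per gate column
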
+ +class TestFusionGRUINT8MKLDNNOp3(TestFusionGRUINT8MKLDNNOp): + def set_confs(self): + self.origin_mode = False + + +class TestFusionGRUINT8MKLDNNOp4(TestFusionGRUINT8MKLDNNOp): + def set_confs(self): + self.with_bias = False + + +class TestFusionGRUINT8MKLDNNOp5(TestFusionGRUINT8MKLDNNOp): + def set_confs(self): + self.with_h0 = False + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 14e83fccd655527d8f3012365e4757d23236a445..47bf8f49e39b6451ee480d461e83324b89cacee2 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -448,7 +448,6 @@ class TestAdamOpV2(unittest.TestCase): def test_adam_op_with_state_dict(self): - import paddle paddle.disable_static() emb = paddle.nn.Embedding(10, 10) @@ -517,6 +516,20 @@ class TestAdamOpV2(unittest.TestCase): adam = paddle.optimizer.Adam( 0.1, epsilon=-1, parameters=linear.parameters()) + def test_adam_op_with_sparse_input_and_weight_decay(self): + + paddle.disable_static() + x_data = np.arange(0, 10).reshape((10, 1)).astype(np.int64) + x = paddle.to_tensor(x_data, stop_gradient=False) + emb = paddle.nn.Embedding(10, 10, sparse=True) + adam = paddle.optimizer.Adam( + 0.001, parameters=emb.parameters(), weight_decay=0.01) + + with self.assertRaises(RuntimeError): + out = emb(x) + out.backward() + adam.step() + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_amp_check_finite_and_scale_op.py b/python/paddle/fluid/tests/unittests/test_amp_check_finite_and_scale_op.py index 70863d3857c43c84a583f0ccf7b9bd733fdb4fd0..fbacaa3d5ce10bdad6dd87fdfc04c1173aff18ff 100644 --- a/python/paddle/fluid/tests/unittests/test_amp_check_finite_and_scale_op.py +++ b/python/paddle/fluid/tests/unittests/test_amp_check_finite_and_scale_op.py @@ -18,9 +18,9 @@ from op_test import OpTest, skip_check_grad_ci import paddle.fluid as fluid -class TestAmpCheckFiniteAndScaleOp(OpTest): +class TestCheckFiniteAndUnscaleOp(OpTest): def setUp(self): - self.op_type = "amp_check_finite_and_scale" + self.op_type = "check_finite_and_unscale" self.init_dtype() x = np.random.random((1024, 1024)).astype(self.dtype) scale = np.random.random((1)).astype(self.dtype) @@ -28,7 +28,7 @@ class TestAmpCheckFiniteAndScaleOp(OpTest): self.inputs = {'X': [('x0', x)], 'Scale': scale} self.outputs = { 'FoundInfinite': np.array([0]), - 'Out': [('out0', x * scale)], + 'Out': [('out0', x / scale)], } def init_dtype(self): @@ -38,9 +38,9 @@ class TestAmpCheckFiniteAndScaleOp(OpTest): self.check_output() -class TestAmpCheckFiniteAndScaleOpWithNan(OpTest): +class TestCheckFiniteAndUnscaleOpWithNan(OpTest): def setUp(self): - self.op_type = "amp_check_finite_and_scale" + self.op_type = "check_finite_and_unscale" self.init_dtype() x = np.random.random((1024, 1024)).astype(self.dtype) x[128][128] = np.nan @@ -61,9 +61,9 @@ class TestAmpCheckFiniteAndScaleOpWithNan(OpTest): self.check_output(no_check_set=['Out']) -class TestAmpCheckFiniteAndScaleOpWithInf(OpTest): +class TestCheckFiniteAndUnscaleOpWithInf(OpTest): def setUp(self): - self.op_type = "amp_check_finite_and_scale" + self.op_type = "check_finite_and_unscale" self.init_dtype() x = np.random.random((1024, 1024)).astype(self.dtype) x[128][128] = np.inf diff --git a/python/paddle/fluid/tests/unittests/test_calc_gradient.py b/python/paddle/fluid/tests/unittests/test_calc_gradient.py index 
3e8c449d8995ca90401861e93f2fb987d1c6967d..fdfaf6a3113bbb9a50a79de7ef4ac4c3251d5759 100644 --- a/python/paddle/fluid/tests/unittests/test_calc_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_calc_gradient.py @@ -15,7 +15,7 @@ from __future__ import print_function import unittest - +import numpy as np import paddle.fluid as fluid import paddle.fluid.layers as layers from paddle.fluid.backward import calc_gradient @@ -81,5 +81,22 @@ class TestDoubleGrad(unittest.TestCase): self.assertEqual(12, out[0]) +class TestGradientWithPrune(unittest.TestCase): + def test_prune(self): + x = fluid.data(name='x', shape=[3], dtype='float32') + x.stop_gradient = False + x1, x2, x3 = fluid.layers.split(x, dim=0, num_or_sections=3) + y = x1 * 2 + x1_grad = fluid.gradients(y, x) + + exe = fluid.Executor(fluid.CPUPlace()) + main = fluid.default_main_program() + exe.run(fluid.default_startup_program()) + out = exe.run(main, + feed={'x': np.ones([3]).astype('float32')}, + fetch_list=[x1_grad]) + self.assertTrue(np.array_equal(out[0], [2., 0., 0.])) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_compare_op.py b/python/paddle/fluid/tests/unittests/test_compare_op.py index cfad50409802d4f3d35c9da3b22597c681da91b1..25ae65aa7c968b2e6f1f9429d1a4e4e618fe7033 100644 --- a/python/paddle/fluid/tests/unittests/test_compare_op.py +++ b/python/paddle/fluid/tests/unittests/test_compare_op.py @@ -38,6 +38,7 @@ def create_test_class(op_type, typename, callback): self.check_output() def test_errors(self): + paddle.enable_static() with program_guard(Program(), Program()): x = fluid.layers.data(name='x', shape=[2], dtype='int32') y = fluid.layers.data(name='y', shape=[2], dtype='int32') @@ -80,6 +81,7 @@ def create_paddle_case(op_type, callback): self.place = paddle.CUDAPlace(0) def test_api(self): + paddle.enable_static() with program_guard(Program(), Program()): x = fluid.data(name='x', shape=[4], dtype='int64') y = fluid.data(name='y', shape=[4], dtype='int64') @@ -92,6 +94,7 @@ def create_paddle_case(op_type, callback): self.assertEqual((res == self.real_result).all(), True) def test_broadcast_api_1(self): + paddle.enable_static() with program_guard(Program(), Program()): x = paddle.static.data( name='x', shape=[1, 2, 1, 3], dtype='int32') @@ -108,6 +111,7 @@ def create_paddle_case(op_type, callback): self.assertEqual((res == real_result).all(), True) def test_attr_name(self): + paddle.enable_static() with program_guard(Program(), Program()): x = fluid.layers.data(name='x', shape=[4], dtype='int32') y = fluid.layers.data(name='y', shape=[4], dtype='int32') @@ -130,6 +134,7 @@ create_paddle_case('not_equal', lambda _a, _b: _a != _b) class TestCompareOpError(unittest.TestCase): def test_errors(self): + paddle.enable_static() with program_guard(Program(), Program()): # The input x and y of compare_op must be Variable. 
x = fluid.layers.data(name='x', shape=[1], dtype="float32") @@ -140,6 +145,7 @@ class TestCompareOpError(unittest.TestCase): class API_TestElementwise_Equal(unittest.TestCase): def test_api(self): + paddle.enable_static() with fluid.program_guard(fluid.Program(), fluid.Program()): label = fluid.layers.assign(np.array([3, 3], dtype="int32")) limit = fluid.layers.assign(np.array([3, 2], dtype="int32")) @@ -159,5 +165,31 @@ class API_TestElementwise_Equal(unittest.TestCase): self.assertEqual((res == np.array([True, True])).all(), True) +class TestCompareOpPlace(unittest.TestCase): + def test_place_1(self): + paddle.enable_static() + place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + label = fluid.layers.assign(np.array([3, 3], dtype="int32")) + limit = fluid.layers.assign(np.array([3, 2], dtype="int32")) + out = fluid.layers.less_than(label, limit, force_cpu=True) + exe = fluid.Executor(place) + res, = exe.run(fetch_list=[out]) + self.assertEqual((res == np.array([False, False])).all(), True) + + def test_place_2(self): + place = paddle.CPUPlace() + data_place = place + if core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + data_place = paddle.CUDAPinnedPlace() + paddle.disable_static(place) + data = np.array([9], dtype="int64") + data_tensor = paddle.to_tensor(data, place=data_place) + result = data_tensor == 0 + self.assertEqual((result.numpy() == np.array([False])).all(), True) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py index ab47659a88de44f60291ef056c0fd5ed2e01b5f2..5a5d8afc55bac4c0ea862e75b728c6c1a37b3188 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py @@ -62,82 +62,6 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): a_sync_configs = optimizer.user_defined_strategy.a_sync_configs self.assertTrue(a_sync_configs['k_steps'] == 0) - def test_a_sync_optimizer2(self): - os.environ["TRAINING_ROLE"] = "TRAINER" - import paddle.distributed.fleet as fleet - - main_program = paddle.fluid.Program() - startup_program = paddle.fluid.Program() - - paddle.fluid.framework.switch_main_program(main_program) - paddle.fluid.framework.switch_startup_program(startup_program) - - fleet.init(role_maker.PaddleCloudRoleMaker()) - input_x = paddle.fluid.layers.data( - name="x", shape=[32], dtype='float32') - input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') - - fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') - fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') - prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') - cost = paddle.fluid.layers.cross_entropy( - input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) - - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.auto = True - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) - - self.assertTrue(optimizer.user_defined_strategy.a_sync) - a_sync_configs = optimizer.user_defined_strategy.a_sync_configs - self.assertTrue(a_sync_configs['k_steps'] == 800) - - def test_a_sync_optimizer3(self): - os.environ["TRAINING_ROLE"] = "TRAINER" - import paddle.distributed.fleet as fleet - - 
main_program = paddle.fluid.Program() - startup_program = paddle.fluid.Program() - - paddle.fluid.framework.switch_main_program(main_program) - paddle.fluid.framework.switch_startup_program(startup_program) - - fleet.init(role_maker.PaddleCloudRoleMaker()) - input_x = paddle.fluid.layers.data( - name="x", - shape=[-1, 1], - dtype="int64", - lod_level=1, - append_batch_size=False) - x_embedding = paddle.fluid.layers.embedding( - is_distributed=False, - input=input_x, - size=[1000000000, 100000], - param_attr=paddle.fluid.ParamAttr( - name="embedding", - initializer=paddle.fluid.initializer.Constant(value=0.01)), - is_sparse=True) - input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') - - fc_1 = paddle.fluid.layers.fc(input=x_embedding, size=64, act='tanh') - fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') - prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') - cost = paddle.fluid.layers.cross_entropy( - input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) - - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.auto = True - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) - - self.assertTrue(optimizer.user_defined_strategy.a_sync) - a_sync_configs = optimizer.user_defined_strategy.a_sync_configs - self.assertTrue(a_sync_configs['k_steps'] == 0) - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py new file mode 100644 index 0000000000000000000000000000000000000000..9085556c04c356e5b703ec0b36c3884100ad73f8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py @@ -0,0 +1,79 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import os +import paddle.distributed.fleet.base.role_maker as role_maker +import time + + +class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_PSERVER_NUMS"] = "2" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + os.environ["PADDLE_TRAINER_ID"] = "0" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ + "127.0.0.1:36001,127.0.0.2:36001" + + def test_a_sync_optimizer3(self): + os.environ["TRAINING_ROLE"] = "TRAINER" + import paddle.distributed.fleet as fleet + + main_program = paddle.fluid.Program() + startup_program = paddle.fluid.Program() + + paddle.fluid.framework.switch_main_program(main_program) + paddle.fluid.framework.switch_startup_program(startup_program) + + fleet.init(role_maker.PaddleCloudRoleMaker()) + input_x = paddle.fluid.layers.data( + name="x", + shape=[-1, 1], + dtype="int64", + lod_level=1, + append_batch_size=False) + x_embedding = paddle.fluid.layers.embedding( + is_distributed=False, + input=input_x, + size=[1000000000, 100000], + param_attr=paddle.fluid.ParamAttr( + name="embedding", + initializer=paddle.fluid.initializer.Constant(value=0.01)), + is_sparse=True) + input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=x_embedding, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.auto = True + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + self.assertTrue(optimizer.user_defined_strategy.a_sync) + a_sync_configs = optimizer.user_defined_strategy.a_sync_configs + self.assertTrue(a_sync_configs['k_steps'] == 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py new file mode 100644 index 0000000000000000000000000000000000000000..4787d048bd2566fe063073867bcbd4138d25ff21 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py @@ -0,0 +1,67 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import os +import paddle.distributed.fleet.base.role_maker as role_maker +import time + + +class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_PSERVER_NUMS"] = "2" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + os.environ["PADDLE_TRAINER_ID"] = "0" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ + "127.0.0.1:36001,127.0.0.2:36001" + + def test_a_sync_optimizer2(self): + os.environ["TRAINING_ROLE"] = "TRAINER" + import paddle.distributed.fleet as fleet + + main_program = paddle.fluid.Program() + startup_program = paddle.fluid.Program() + + paddle.fluid.framework.switch_main_program(main_program) + paddle.fluid.framework.switch_startup_program(startup_program) + + fleet.init(role_maker.PaddleCloudRoleMaker()) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.auto = True + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + self.assertTrue(optimizer.user_defined_strategy.a_sync) + a_sync_configs = optimizer.user_defined_strategy.a_sync_configs + self.assertTrue(a_sync_configs['k_steps'] == 800) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py index 1062123948481a4164a12a4bed818b964923006f..761d57408b9a8f9e52419331bfb0bca5b0135c30 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py @@ -113,8 +113,8 @@ class TranspilerAsyncLRDecayTest(unittest.TestCase): ["listen_and_serv"]) # block1: sum,cast,scale,floor,fill_constant,elementwise_pow,scale self.assertEqual([op.type for op in pserver.blocks[1].ops], [ - "sum", "cast", "fill_constant", "elementwise_div", "floor", - "fill_constant", "elementwise_pow", "scale" + "sum", "cast", "scale", "floor", "fill_constant", "elementwise_pow", + "scale" ]) # block1~2: optimize pass diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py index 9ebaf8ff9438be8c8a57815be0798b861d05caaf..3cfbac8b613c125956861f73b1bab24c34e05572 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py @@ -240,124 +240,25 @@ class TestElementwiseDivBroadcast(unittest.TestCase): self.assertEqual((out_result == (2 / x)).all(), True) -class TestDivideAPI(unittest.TestCase): - def setUp(self): - paddle.set_default_dtype("float64") - self.places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(fluid.CUDAPlace(0)) - - def check_static_result(self, place): - # rule 1 - with fluid.program_guard(fluid.Program(), 
fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = np.array([1, 2, 3]) - self.assertRaises(TypeError, paddle.divide, x=x, y=y) - - # rule 2: both the inputs are not Tensor - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = 2 - y = 4 - res = paddle.divide(x, y) - exe = fluid.Executor(place) - np_z = exe.run(fluid.default_main_program(), - feed={}, - fetch_list=[res]) - self.assertEqual(np_z[0] == 0.5, True) - - # rule 3: - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = fluid.data(name="y", shape=[3], dtype="float32") - self.assertRaises(TypeError, paddle.divide, x=x, y=y) - - # rule 4: x is Tensor, y is scalar - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = 2 - exe = fluid.Executor(place) - res = x / y - np_z = exe.run(fluid.default_main_program(), - feed={"x": np.array([2, 3, 4]).astype('float64')}, - fetch_list=[res]) - z_expected = np.array([1., 1.5, 2.]) - self.assertEqual((np_z[0] == z_expected).all(), True) - - # rule 5: y is Tensor, x is scalar - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = 2 - exe = fluid.Executor(place) - res = y / x - np_z = exe.run(fluid.default_main_program(), - feed={"x": np.array([2, 8, 4]).astype('float64')}, - fetch_list=[res]) - z_expected = np.array([1., 0.25, 0.5]) - self.assertEqual((np_z[0] == z_expected).all(), True) - - # rule 6: y is Tensor, x is Tensor - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = fluid.data(name="y", shape=[3], dtype="float64") - exe = fluid.Executor(place) - res = x / y - np_z = exe.run(fluid.default_main_program(), - feed={ - "x": np.array([2, 3, 4]).astype('float64'), - "y": np.array([1, 5, 2]).astype('float64') - }, - fetch_list=[res]) - z_expected = np.array([2., 0.6, 2.]) - self.assertEqual((np_z[0] == z_expected).all(), True) +class TestDivideOp(unittest.TestCase): + def test_name(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data(name="x", shape=[2, 3], dtype="float32") + y = fluid.data(name='y', shape=[2, 3], dtype='float32') - def test_static(self): - for place in self.places: - self.check_static_result(place=place) + y_1 = paddle.divide(x, y, name='div_res') + self.assertEqual(('div_res' in y_1.name), True) def test_dygraph(self): - for place in self.places: - with fluid.dygraph.guard(place): - # rule 1 : avoid numpy.ndarray - np_x = np.array([2, 3, 4]) - np_y = np.array([1, 5, 2]) - x = paddle.to_tensor(np_x) - self.assertRaises(TypeError, paddle.divide, x=x, y=np_y) - - # rule 2: both the inputs are not Tensor - z = paddle.divide(3, 2) - self.assertEqual(z.numpy()[0] == 1.5, True) - - # rule 3: both the inputs are Tensor - np_x = np.array([2, 3, 4]) - np_y = np.array([1, 5, 2]) - x = paddle.to_tensor(np_x, dtype="float32") - y = paddle.to_tensor(np_y, dtype="float64") - self.assertRaises(TypeError, paddle.divide, x=x, y=y) - - # rule 4: x is Tensor, y is scalar - np_x = np.array([2, 3, 4]) - x = paddle.to_tensor(np_x, dtype="int32") - y = 2 - z = x / y - z_expected = np.array([1., 1.5, 2.]) - self.assertEqual((z_expected == z.numpy()).all(), True) - - # rule 5: y is Tensor, x is scalar - np_x = np.array([2, 1, 4]) - x = paddle.to_tensor(np_x, dtype="int32") - y = 2 - z = y / x - z_expected = np.array([1., 2., 0.5]) - self.assertEqual((z_expected == 
z.numpy()).all(), True) - - # rule 6: y is Tensor, x is Tensor - np_x = np.array([2, 3, 4]) - np_y = np.array([1, 5, 2]) - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) - z = x / y - z_expected = np.array([2., 0.6, 2.]) - self.assertEqual((z_expected == z.numpy()).all(), True) + with fluid.dygraph.guard(): + np_x = np.array([2, 3, 4]).astype('float64') + np_y = np.array([1, 5, 2]).astype('float64') + x = paddle.to_tensor(np_x) + y = paddle.to_tensor(np_y) + z = paddle.divide(x, y) + np_z = z.numpy() + z_expected = np.array([2., 0.6, 2.]) + self.assertEqual((np_z == z_expected).all(), True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py index 0b6acc7615395ed99a484e0e56f9c62447a1f345..f339081e31b87b8d5584fd4f866e0aaf6f391ea7 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py @@ -58,13 +58,6 @@ class TestElementwiseModOp(OpTest): pass -class TestElementwiseModOpInverse(TestElementwiseModOp): - def init_input_output(self): - self.x = np.random.uniform(0, 10000, [10]).astype(self.dtype) - self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype) - self.out = np.floor_divide(self.x, self.y) - - class TestElementwiseModOp_scalar(TestElementwiseModOp): def init_input_output(self): scale_x = random.randint(0, 100000000) @@ -74,124 +67,25 @@ class TestElementwiseModOp_scalar(TestElementwiseModOp): self.out = np.floor_divide(self.x, self.y) -class TestFloorDivideAPI(unittest.TestCase): - def setUp(self): - paddle.set_default_dtype("float64") - self.places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(fluid.CUDAPlace(0)) - - def check_static_result(self, place): - # rule 1 - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = np.array([1, 2, 3]) - self.assertRaises(TypeError, paddle.floor_divide, x=x, y=y) - - # rule 2: both the inputs are not Tensor - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = 2 - y = 4 - res = paddle.floor_divide(x, y) - exe = fluid.Executor(place) - np_z = exe.run(fluid.default_main_program(), - feed={}, - fetch_list=[res]) - self.assertEqual(np_z[0] == 0., True) - - # rule 3: - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = fluid.data(name="y", shape=[3], dtype="float32") - self.assertRaises(TypeError, paddle.floor_divide, x=x, y=y) - - # rule 4: x is Tensor, y is scalar - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = 2 - exe = fluid.Executor(place) - res = x // y - np_z = exe.run(fluid.default_main_program(), - feed={"x": np.array([2, 3, 4]).astype('float64')}, - fetch_list=[res]) - z_expected = np.array([1., 1., 2.]) - self.assertEqual((np_z[0] == z_expected).all(), True) - - # rule 5: y is Tensor, x is scalar - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = 2 - exe = fluid.Executor(place) - res = y // x - np_z = exe.run(fluid.default_main_program(), - feed={"x": np.array([2, 8, 4]).astype('float64')}, - fetch_list=[res]) - z_expected = np.array([1., 0., 0.]) - self.assertEqual((np_z[0] == z_expected).all(), True) - - # rule 6: y is Tensor, x is Tensor - with 
fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = fluid.data(name="y", shape=[3], dtype="float64") - exe = fluid.Executor(place) - res = x // y - np_z = exe.run(fluid.default_main_program(), - feed={ - "x": np.array([2, 3, 4]).astype('float64'), - "y": np.array([1, 5, 2]).astype('float64') - }, - fetch_list=[res]) - z_expected = np.array([2., 0., 2.]) - self.assertEqual((np_z[0] == z_expected).all(), True) - - def test_static(self): - for place in self.places: - self.check_static_result(place=place) +class TestFloorDivideOp(unittest.TestCase): + def test_name(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data(name="x", shape=[2, 3], dtype="int64") + y = fluid.data(name='y', shape=[2, 3], dtype='int64') + + y_1 = paddle.floor_divide(x, y, name='div_res') + self.assertEqual(('div_res' in y_1.name), True) def test_dygraph(self): - for place in self.places: - with fluid.dygraph.guard(place): - # rule 1 : avoid numpy.ndarray - np_x = np.array([2, 3, 4]) - np_y = np.array([1, 5, 2]) - x = paddle.to_tensor(np_x) - self.assertRaises(TypeError, paddle.floor_divide, x=x, y=np_y) - - # rule 2: both the inputs are not Tensor - z = paddle.floor_divide(3, 2) - self.assertEqual(z.numpy()[0] == 1., True) - - # rule 3: both the inputs are Tensor - np_x = np.array([2, 3, 4]) - np_y = np.array([1, 5, 2]) - x = paddle.to_tensor(np_x, dtype="float32") - y = paddle.to_tensor(np_y, dtype="float64") - self.assertRaises(TypeError, paddle.floor_divide, x=x, y=y) - - # rule 4: x is Tensor, y is scalar - np_x = np.array([2, 3, 4]) - x = paddle.to_tensor(np_x, dtype="int32") - y = 2 - z = x // y - z_expected = np.array([1, 1, 2]) - self.assertEqual((z_expected == z.numpy()).all(), True) - - # rule 5: y is Tensor, x is scalar - np_x = np.array([2, 1, 4]) - x = paddle.to_tensor(np_x, dtype="int32") - y = 2 - z = y // x - z_expected = np.array([1, 2, 0]) - self.assertEqual((z_expected == z.numpy()).all(), True) - - # rule 6: y is Tensor, x is Tensor - np_x = np.array([2, 3, 4]) - np_y = np.array([1, 5, 2]) - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) - z = x // y - z_expected = np.array([2., 0., 2.]) - self.assertEqual((z_expected == z.numpy()).all(), True) + with fluid.dygraph.guard(): + np_x = np.array([2, 3, 8, 7]).astype('int64') + np_y = np.array([1, 5, 3, 3]).astype('int64') + x = paddle.to_tensor(np_x) + y = paddle.to_tensor(np_y) + z = paddle.floor_divide(x, y) + np_z = z.numpy() + z_expected = np.array([2, 0, 2, 2]) + self.assertEqual((np_z == z_expected).all(), True) with fluid.dygraph.guard(fluid.CPUPlace()): # divide by zero diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py index cab6160d761004877896deea8d44ca02c9de2e1e..2a8ca51693ecfad55f2239d7619e355c6dd7f3f8 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py @@ -84,149 +84,41 @@ class TestElementwiseModOpDouble(TestElementwiseModOpFloat): self.dtype = np.float64 -class TestRemainderAPI(unittest.TestCase): - def setUp(self): - paddle.set_default_dtype("float64") - self.places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(fluid.CUDAPlace(0)) - - def check_static_result(self, place): - # rule 1 - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = np.array([1, 2, 3]) - 
self.assertRaises(TypeError, paddle.remainder, x=x, y=y) - - # rule 3: - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = fluid.data(name="y", shape=[3], dtype="float32") - self.assertRaises(TypeError, paddle.remainder, x=x, y=y) - - # rule 4: x is Tensor, y is scalar - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = 2 - exe = fluid.Executor(place) - res = x % y - np_z = exe.run(fluid.default_main_program(), - feed={"x": np.array([2, 3, 4]).astype('float64')}, - fetch_list=[res]) - z_expected = np.array([0., 1., 0.]) - self.assertEqual((np_z[0] == z_expected).all(), True) - - # rule 5: y is Tensor, x is scalar - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = 3 - y = fluid.data(name="y", shape=[3], dtype="float32") - self.assertRaises(TypeError, paddle.remainder, x=x, y=y) - - # rule 6: y is Tensor, x is Tensor - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = fluid.data(name="y", shape=[1], dtype="float64") - exe = fluid.Executor(place) - res = x % y - np_z = exe.run(fluid.default_main_program(), - feed={ - "x": np.array([1., 2., 4]).astype('float64'), - "y": np.array([1.5]).astype('float64') - }, - fetch_list=[res]) - z_expected = np.array([1., 0.5, 1.0]) - self.assertEqual((np_z[0] == z_expected).all(), True) - - # rule 6: y is Tensor, x is Tensor - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[6], dtype="float64") - y = fluid.data(name="y", shape=[1], dtype="float64") - exe = fluid.Executor(place) - res = x % y - np_z = exe.run( - fluid.default_main_program(), - feed={ - "x": np.array([-3., -2, -1, 1, 2, 3]).astype('float64'), - "y": np.array([2]).astype('float64') - }, - fetch_list=[res]) - z_expected = np.array([1., 0., 1., 1., 0., 1.]) - self.assertEqual((np_z[0] == z_expected).all(), True) - - def test_static(self): - for place in self.places: - self.check_static_result(place=place) +class TestRemainderOp(unittest.TestCase): + def test_name(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data(name="x", shape=[2, 3], dtype="int64") + y = fluid.data(name='y', shape=[2, 3], dtype='int64') + + y_1 = paddle.remainder(x, y, name='div_res') + self.assertEqual(('div_res' in y_1.name), True) def test_dygraph(self): - for place in self.places: - with fluid.dygraph.guard(place): - # rule 1 : avoid numpy.ndarray - np_x = np.array([2, 3, 4]) - np_y = np.array([1, 5, 2]) - x = paddle.to_tensor(np_x) - self.assertRaises(TypeError, paddle.remainder, x=x, y=np_y) - - # rule 3: both the inputs are Tensor - np_x = np.array([2, 3, 4]) - np_y = np.array([1, 5, 2]) - x = paddle.to_tensor(np_x, dtype="float32") - y = paddle.to_tensor(np_y, dtype="float64") - self.assertRaises(TypeError, paddle.remainder, x=x, y=y) - - # rule 4: x is Tensor, y is scalar - np_x = np.array([2, 3, 4]) - x = paddle.to_tensor(np_x, dtype="int32") - y = 2 - z = x % y - z_expected = np.array([0, 1, 0]) - self.assertEqual((z_expected == z.numpy()).all(), True) - - # rule 5: y is Tensor, x is scalar - np_x = np.array([2, 3, 4]) - x = paddle.to_tensor(np_x) - self.assertRaises(TypeError, paddle.remainder, x=3, y=x) - - # rule 6: y is Tensor, x is Tensor - np_x = np.array([1., 2., 4]) - np_y = np.array([1.5]) - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) - z = x % y - z_expected = np.array([1., 0.5, 1.0]) - 
self.assertEqual((z_expected == z.numpy()).all(), True) - - # rule 6: y is Tensor, x is Tensor - np_x = np.array([-3., -2, -1, 1, 2, 3]) - np_y = np.array([2.]) - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) - z = x % y - z_expected = np.array([1., 0., 1., 1., 0., 1.]) - self.assertEqual((z_expected == z.numpy()).all(), True) - - np_x = np.array([-3.3, 11.5, -2, 3.5]) - np_y = np.array([-1.2, 2., 3.3, -2.3]) - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) - z = x % y - z_expected = np.array([-0.9, 1.5, 1.3, -1.1]) - self.assertEqual(np.allclose(z_expected, z.numpy()), True) - - np_x = np.array([-3, 11, -2, 3]) - np_y = np.array([-1, 2, 3, -2]) - x = paddle.to_tensor(np_x, dtype="int64") - y = paddle.to_tensor(np_y, dtype="int64") - z = x % y - z_expected = np.array([0, 1, 1, -1]) - self.assertEqual(np.allclose(z_expected, z.numpy()), True) - - np_x = np.array([-3, 3]) - np_y = np.array([[2, 3], [-2, -1]]) - x = paddle.to_tensor(np_x, dtype="int64") - y = paddle.to_tensor(np_y, dtype="int64") - z = x % y - z_expected = np.array([[1, 0], [-1, 0]]) - self.assertEqual(np.allclose(z_expected, z.numpy()), True) + with fluid.dygraph.guard(): + np_x = np.array([2, 3, 8, 7]).astype('int64') + np_y = np.array([1, 5, 3, 3]).astype('int64') + x = paddle.to_tensor(np_x) + y = paddle.to_tensor(np_y) + z = paddle.remainder(x, y) + np_z = z.numpy() + z_expected = np.array([0, 3, 2, 1]) + self.assertEqual((np_z == z_expected).all(), True) + + np_x = np.array([-3.3, 11.5, -2, 3.5]) + np_y = np.array([-1.2, 2., 3.3, -2.3]) + x = paddle.to_tensor(np_x) + y = paddle.to_tensor(np_y) + z = x % y + z_expected = np.array([-0.9, 1.5, 1.3, -1.1]) + self.assertEqual(np.allclose(z_expected, z.numpy()), True) + + np_x = np.array([-3, 11, -2, 3]) + np_y = np.array([-1, 2, 3, -2]) + x = paddle.to_tensor(np_x, dtype="int64") + y = paddle.to_tensor(np_y, dtype="int64") + z = x % y + z_expected = np.array([0, 1, 1, -1]) + self.assertEqual(np.allclose(z_expected, z.numpy()), True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_empty_op.py b/python/paddle/fluid/tests/unittests/test_empty_op.py new file mode 100644 index 0000000000000000000000000000000000000000..e8b1f836fcaa8d53671307d9075efd45fc88ce7b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_empty_op.py @@ -0,0 +1,270 @@ +#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
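The rewritten remainder tests above pin down the sign convention for paddle.remainder and the % operator: as in Python, the result takes the sign of the divisor, for both integer and float tensors. The asserted expectations match numpy's mod, which can be verified directly (a standalone numpy sketch):

import numpy as np

# Integer case from the test: the result's sign follows the divisor.
print(np.mod(np.array([-3, 11, -2, 3]), np.array([-1, 2, 3, -2])))
# -> [ 0  1  1 -1]

# Float case from the test, up to rounding error.
print(np.mod(np.array([-3.3, 11.5, -2.0, 3.5]),
             np.array([-1.2, 2.0, 3.3, -2.3])))
# -> approximately [-0.9  1.5  1.3 -1.1]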
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +from op_test import OpTest +from paddle.fluid import Program, program_guard +from paddle.fluid.framework import convert_np_dtype_to_dtype_ + + +# Situation 1: Attr(shape) is a list(without tensor) +class TestEmptyOp(OpTest): + def setUp(self): + self.op_type = "empty" + self.init_config() + + def test_check_output(self): + self.check_output_customized(self.verify_output) + + def verify_output(self, outs): + data_type = outs[0].dtype + if data_type in ['float32', 'float64', 'int32', 'int64']: + max_value = np.nanmax(outs[0]) + min_value = np.nanmin(outs[0]) + + always_full_zero = max_value == 0.0 and min_value == 0.0 + always_non_full_zero = max_value > min_value + self.assertTrue(always_full_zero or always_non_full_zero, + 'always_full_zero or always_non_full_zero.') + elif data_type in ['bool']: + total_num = outs[0].size + true_num = np.sum(outs[0] == True) + false_num = np.sum(outs[0] == False) + self.assertTrue(total_num == true_num + false_num, + 'The value should always be True or False.') + else: + self.assertTrue(False, 'invalid data type') + + def init_config(self): + shape = [500, 3] + dtype = 'float32' + dtype_inner = convert_np_dtype_to_dtype_(dtype) + self.attrs = {'shape': shape, 'dtype': dtype_inner} + self.inputs = {} + self.outputs = {'Out': np.zeros(shape).astype(dtype)} + + +class TestEmptyOp2(TestEmptyOp): + def init_config(self): + shape = [500, 3] + dtype = 'float64' + dtype_inner = convert_np_dtype_to_dtype_(dtype) + self.attrs = {'shape': shape, 'dtype': dtype_inner} + self.inputs = {} + self.outputs = {'Out': np.zeros(shape).astype(dtype)} + + +class TestEmptyOp3(TestEmptyOp): + def init_config(self): + shape = [500, 3] + dtype = 'int32' + dtype_inner = convert_np_dtype_to_dtype_(dtype) + self.attrs = {'shape': shape, 'dtype': dtype_inner} + self.inputs = {} + self.outputs = {'Out': np.zeros(shape).astype(dtype)} + + +class TestEmptyOp4(TestEmptyOp): + def init_config(self): + shape = [500, 3] + dtype = 'int64' + dtype_inner = convert_np_dtype_to_dtype_(dtype) + self.attrs = {'shape': shape, 'dtype': dtype_inner} + self.inputs = {} + self.outputs = {'Out': np.zeros(shape).astype(dtype)} + + +class TestEmptyOp5(TestEmptyOp): + def init_config(self): + shape = [500, 3] + dtype = 'bool' + dtype_inner = convert_np_dtype_to_dtype_(dtype) + self.attrs = {'shape': shape, 'dtype': dtype_inner} + self.inputs = {} + self.outputs = {'Out': np.zeros(shape).astype(dtype)} + + +# Situation 2: shape is a tensor +class TestEmptyOp_ShapeTensor(OpTest): + def setUp(self): + self.op_type = "empty" + self.init_config() + + def init_config(self): + self.shape = [500, 3] + dtype = 'float32' + dtype_inner = convert_np_dtype_to_dtype_(dtype) + self.attrs = {'dtype': dtype_inner} + self.inputs = {"ShapeTensor": np.array(self.shape).astype("int32")} + self.outputs = {'Out': np.zeros(self.shape).astype(dtype)} + + def test_check_output(self): + self.check_output_customized(self.verify_output) + + def verify_output(self, outs): + data_type = outs[0].dtype + if data_type in ['float32', 'float64', 'int32', 'int64']: + max_value = np.nanmax(outs[0]) + min_value = np.nanmin(outs[0]) + + always_full_zero = max_value == 0.0 and min_value == 0.0 + always_non_full_zero = max_value > min_value + self.assertTrue(always_full_zero or always_non_full_zero, + 'always_full_zero or always_non_full_zero.') + elif data_type in ['bool']: + total_num = outs[0].size + true_num = 
np.sum(outs[0] == True) + false_num = np.sum(outs[0] == False) + self.assertTrue(total_num == true_num + false_num, + 'The value should always be True or False.') + else: + self.assertTrue(False, 'invalid data type') + + +# Situation 3: Attr(shape) is a list(with tensor) +class TestEmptyOp_ShapeTensorList(OpTest): + def setUp(self): + self.op_type = "empty" + self.init_config() + + def init_config(self): + self.shape = [123, 92] + self.infer_shape = [-1, 92] + + dtype = 'float32' + dtype_inner = convert_np_dtype_to_dtype_(dtype) + + shape_tensor_list = [] + for index, ele in enumerate(self.shape): + shape_tensor_list.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = {"ShapeTensorList": shape_tensor_list} + self.attrs = {'shape': self.infer_shape, 'dtype': dtype_inner} + self.outputs = {'Out': np.zeros(self.shape).astype(dtype)} + + def test_check_output(self): + self.check_output_customized(self.verify_output) + + def verify_output(self, outs): + data_type = outs[0].dtype + if data_type in ['float32', 'float64', 'int32', 'int64']: + max_value = np.nanmax(outs[0]) + min_value = np.nanmin(outs[0]) + + always_full_zero = max_value == 0.0 and min_value == 0.0 + always_non_full_zero = max_value > min_value + self.assertTrue(always_full_zero or always_non_full_zero, + 'always_full_zero or always_non_full_zero.') + elif data_type in ['bool']: + total_num = outs[0].size + true_num = np.sum(outs[0] == True) + false_num = np.sum(outs[0] == False) + self.assertTrue(total_num == true_num + false_num, + 'The value should always be True or False.') + else: + self.assertTrue(False, 'invalid data type') + + +class TestEmptyAPI(unittest.TestCase): + def __check_out__(self, out, dtype='float32'): + max_value = np.nanmax(np.array(out)) + min_value = np.nanmin(np.array(out)) + always_non_full_zero = max_value > min_value + always_full_zero = max_value == 0.0 and min_value == 0.0 + self.assertTrue(always_full_zero or always_non_full_zero, + 'always_full_zero or always_non_full_zero.') + + def test_dygraph_api_out(self): + paddle.disable_static() + shape = [200, 3] + out = paddle.empty(shape=shape) + self.__check_out__(out) + paddle.enable_static() + + def test_dygraph_api_out_2(self): + paddle.disable_static() + shape_data = np.array([200, 3]).astype('int32') + shape = paddle.to_tensor(shape_data) + out = paddle.empty(shape=shape) + self.__check_out__(out) + paddle.enable_static() + + def test_dygraph_api_out_3(self): + paddle.disable_static() + shape_data = np.array([200, 3]).astype('int64') + shape = paddle.to_tensor(shape_data) + out = paddle.empty(shape=shape) + self.__check_out__(out) + paddle.enable_static() + + def test_dygraph_api_attr(self): + paddle.disable_static() + shape = [200, 3] + dtype = 'float64' + out = paddle.empty(shape=shape, dtype=dtype) + self.__check_out__(out, dtype) + paddle.enable_static() + + def test_static_graph(self): + dtype = 'float64' + + positive_2_int32 = fluid.layers.fill_constant([1], "int32", 3) + positive_2_int64 = fluid.layers.fill_constant([1], "int64", 3) + + shape_tensor_int32 = fluid.data( + name="shape_tensor_int32", shape=[2], dtype="int32") + shape_tensor_int64 = fluid.data( + name="shape_tensor_int64", shape=[2], dtype="int64") + + out_1 = paddle.empty(shape=[200, 3], dtype=dtype) + out_2 = paddle.empty(shape=shape_tensor_int32, dtype=dtype) + out_3 = paddle.empty(shape=shape_tensor_int64, dtype=dtype) + out_4 = paddle.empty(shape=[200, positive_2_int32], dtype=dtype) + out_5 = paddle.empty(shape=[200, positive_2_int64], 
dtype=dtype) + + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + res_1, res_2, res_3, res_4, res_5 = exe.run( + fluid.default_main_program(), + feed={ + "shape_tensor_int32": np.array([200, 3]).astype("int32"), + "shape_tensor_int64": np.array([200, 3]).astype("int64"), + }, + fetch_list=[out_1, out_2, out_3, out_4, out_5]) + + self.__check_out__(res_1, dtype) + self.__check_out__(res_2, dtype) + self.__check_out__(res_3, dtype) + self.__check_out__(res_4, dtype) + self.__check_out__(res_5, dtype) + + +class TestEmptyError(unittest.TestCase): + def test_attr(self): + def test_dtype(): + shape = [200, 3] + dtype = 'uint8' + result = paddle.empty(shape=shape, dtype=dtype) + + self.assertRaises(TypeError, test_dtype) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py b/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py index 1181272bd98b00f65e6925b44da814662f96045f..37d269e3369bfe7db00529dea5e08b287151691a 100644 --- a/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py +++ b/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py @@ -28,7 +28,7 @@ class TestFetchUnmerged(unittest.TestCase): conv_pool_1 = fluid.nets.simple_img_conv_pool( input=img, filter_size=5, - num_filters=20, + num_filters=8, pool_size=2, pool_stride=2, pool_type='max', @@ -37,12 +37,12 @@ class TestFetchUnmerged(unittest.TestCase): conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, - num_filters=50, + num_filters=16, pool_size=2, pool_stride=2, pool_type='avg', act="relu") - hidden = fluid.layers.fc(input=conv_pool_2, size=100, act='relu') + hidden = fluid.layers.fc(input=conv_pool_2, size=32, act='relu') prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) avg_loss = fluid.layers.mean(loss) @@ -75,8 +75,8 @@ class TestFetchUnmerged(unittest.TestCase): binary = fluid.CompiledProgram(main_program).with_data_parallel( loss_name=loss.name, build_strategy=build_strategy) - iters = 3 - batch_size = 64 + iters = 2 + batch_size = 16 train_reader = paddle.batch( paddle.reader.shuffle( paddle.dataset.mnist.train(), buf_size=500), diff --git a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py index 38c3903306e6e76188cdb50476d6797814c434e9..73e014b35008ff5a0539c6a338755b9dc2cf68d4 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py @@ -57,7 +57,7 @@ class TestFleetAMPOptimizer(unittest.TestCase): ops = [op.type for op in avg_cost.block.ops] self.assertIn('cast', ops) - self.assertIn('isfinite', ops) + self.assertIn('check_finite_and_unscale', ops) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py index 8d715674cc6c9ba4f8b5c1ff4fe0cbdbe7841643..6f8af3017efcb9010b129131a01c5ee071b5bc36 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py @@ -81,9 +81,10 @@ class TestStrategyConfig(unittest.TestCase): def test_localsgd_configs(self): strategy = paddle.distributed.fleet.DistributedStrategy() - configs = {"k_steps": 4} + configs = {"k_steps": 4, "begin_step": 120} strategy.localsgd_configs = 
configs self.assertEqual(strategy.localsgd_configs["k_steps"], 4) + self.assertEqual(strategy.localsgd_configs["begin_step"], 120) def test_dgc(self): strategy = paddle.distributed.fleet.DistributedStrategy() @@ -230,7 +231,7 @@ class TestStrategyConfig(unittest.TestCase): strategy.a_sync = True strategy.localsgd = True strategy.dgc = True - localsgd_configs = {"k_steps": 5} + localsgd_configs = {"k_steps": 5, "begin_step": 1} strategy.localsgd_configs = localsgd_configs build_strategy = paddle.fluid.BuildStrategy() build_strategy.enable_sequential_execution = True @@ -316,6 +317,14 @@ class TestStrategyConfig(unittest.TestCase): self.assertEqual(strategy.conv_workspace_size_limit, 1000) strategy._enable_env() + def test_distributed_strategy_repr(self): + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.recompute = True + strategy.recompute_configs = {"checkpoints": ["a1", "a2", "a3"]} + strategy.amp = True + strategy.localsgd = True + print(str(strategy)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py index 3f140f53b043b1949572f3728ca8a0c556317783..ff305fb95231b96b6d8f951b2943a0ab47060ce0 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py @@ -22,11 +22,9 @@ import paddle.distributed.fleet.base.role_maker as role_maker class TestFleetLambMetaOptimizer(unittest.TestCase): def setUp(self): - os.environ["POD_IP"] = "127.0.0.1" - os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" - os.environ["PADDLE_TRAINERS_NUM"] = "2" - os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ - "127.0.0.1:36001,127.0.0.2:36001" + os.environ["PADDLE_TRAINER_ID"] = "1" + os.environ[ + "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002" def net(self, main_prog, startup_prog): with fluid.program_guard(main_prog, startup_prog): @@ -97,13 +95,54 @@ class TestFleetLambMetaOptimizer(unittest.TestCase): optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) - ops_with_bias = [ + ops_without_wd = [ op for op in avg_cost.block.ops if op.type == 'lamb' and op.attr('op_role_var')[0].endswith('.b_0') ] - for op in ops_with_bias: + for op in ops_without_wd: self.assertEqual(op.attr('weight_decay'), 0) + def test_lamb_apply_with_amp(self): + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.amp = True + strategy.amp_configs = { + "init_loss_scaling": 32768, + "decr_every_n_nan_or_inf": 2, + "incr_every_n_steps": 1000, + "incr_ratio": 2.0, + "use_dynamic_loss_scaling": True, + "decr_ratio": 0.5, + "custom_white_list": ['softmax'], + "custom_black_list": ['tanh'], + } + strategy.lamb = True + strategy.lamb_configs = { + 'lamb_weight_decay': 0.01, + 'exclude_from_weight_decay': [], + } + + optimizer = 
paddle.fluid.optimizer.Adam(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + ops = [op.type for op in avg_cost.block.ops] + self.assertIn('lamb', ops) + self.assertIn('cast', ops) + self.assertIn('isfinite', ops) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py index 3caa1a4eac0bf191b13e6708b1a9adffdb111ca7..34ab423e064eebb9c93010fbc869adedb42bd6fa 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py @@ -22,11 +22,9 @@ import paddle.distributed.fleet.base.role_maker as role_maker class TestFleetLarsMetaOptimizer(unittest.TestCase): def setUp(self): - os.environ["POD_IP"] = "127.0.0.1" - os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" - os.environ["PADDLE_TRAINERS_NUM"] = "2" - os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ - "127.0.0.1:36001,127.0.0.2:36001" + os.environ["PADDLE_TRAINER_ID"] = "1" + os.environ[ + "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002" def net(self, main_prog, startup_prog): with fluid.program_guard(main_prog, startup_prog): @@ -52,6 +50,8 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase): strategy.lars_configs = { "lars_coeff": 0.001, "lars_weight_decay": 0.0005, + "epsilon": 0, + "exclude_from_weight_decay": ["batch_norm", ".b"], } return avg_cost, strategy @@ -83,6 +83,70 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase): ops = [op.type for op in avg_cost.block.ops] self.assertNotIn('lars_momentum', ops) + def test_lars_exclude_fn(self): + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + startup_prog = fluid.Program() + train_prog = fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + optimizer = paddle.fluid.optimizer.Momentum( + learning_rate=0.01, momentum=0.9) + + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + ops_without_wd = [ + op for op in avg_cost.block.ops + if op.type == 'lars_momentum' and ("batch_norm" in op.attr( + 'op_role_var')[0] or ".b" in op.attr('op_role_var')[0]) + ] + for op in ops_without_wd: + self.assertEqual(op.attr('lars_weight_decay'), 0) + + def test_lars_apply_with_amp(self): + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.amp = True + strategy.amp_configs = { + "init_loss_scaling": 32768, + "decr_every_n_nan_or_inf": 2, + "incr_every_n_steps": 1000, + "incr_ratio": 2.0, + "use_dynamic_loss_scaling": True, + "decr_ratio": 0.5, + "custom_white_list": ['softmax'], + "custom_black_list": ['tanh'], + } + strategy.lars = True + strategy.lars_configs = { + "lars_coeff": 0.001, + "lars_weight_decay": 0.0005, + "epsilon": 0, + "exclude_from_weight_decay": ["batch_norm", ".b"], + } + + 
optimizer = paddle.fluid.optimizer.Momentum( + learning_rate=0.01, momentum=0.9) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + ops = [op.type for op in avg_cost.block.ops] + self.assertIn('lars_momentum', ops) + self.assertIn('cast', ops) + self.assertIn('isfinite', ops) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py index 07b988bf8752057e68925bc42f564a72d466361d..945f5ae57454b2c4a509badb93574a6e03b607e8 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py @@ -44,6 +44,7 @@ class TestFleetLocalSGDMetaOptimizer(unittest.TestCase): strategy.auto = True config = strategy.localsgd_configs config['k_steps'] = 1 + config['begin_step'] = 1 strategy.localsgd_configs = config optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) diff --git a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py index 921dbdbc6d4e1b169c2c8aa199ea15f886bd0128..5bcfc8720ddd2a8b495c50f886c03047c9abdb32 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py @@ -25,7 +25,7 @@ class TestFuseBatchNormActPass(unittest.TestCase): hidden1 = fluid.layers.conv2d( input=x, filter_size=3, - num_filters=32, + num_filters=16, stride=1, padding=1, act=None, @@ -43,7 +43,7 @@ class TestFuseBatchNormActPass(unittest.TestCase): bias_attr=bias_attr, act='relu', data_layout='NHWC') - hidden3 = fluid.layers.fc(input=hidden2, size=128, act='relu') + hidden3 = fluid.layers.fc(input=hidden2, size=32, act='relu') hidden4 = fluid.layers.batch_norm( input=hidden3, act='relu', data_layout='NHWC') prediction = fluid.layers.fc(input=hidden4, size=10, act='softmax') @@ -63,7 +63,7 @@ class TestFuseBatchNormActPass(unittest.TestCase): startup_program = fluid.Program() x, y, loss = self.build_program(main_program, startup_program, use_cuda) exe = fluid.Executor(place) - iters = 10 + iters = 8 batch_size = 16 feeder = fluid.DataFeeder(feed_list=[x, y], place=place) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py index 7fb2cb0090da57ae837d1f774518dd90a41df56c..9b2d71c9f907779bc9b27b51e21056496f8d4dd5 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py @@ -18,6 +18,7 @@ import multiprocessing import numpy as np import paddle.fluid as fluid from paddle.fluid import core +from paddle.fluid.reader import _reader_process_loop if sys.version_info[0] == 2: import Queue as queue @@ -66,7 +67,7 @@ class TestDygraphDataLoaderProcess(unittest.TestCase): batch_generator_creator(self.batch_size, self.batch_num), places=fluid.CPUPlace()) loader._data_queue = queue.Queue(self.batch_num + 1) - loader._reader_process_loop() + _reader_process_loop(loader._batch_reader, loader._data_queue) # For clean memory mapped files util_queue = multiprocessing.Queue(self.batch_num + 1) for _ in range(self.batch_num): @@ -94,7 +95,7 @@ class TestDygraphDataLoaderProcess(unittest.TestCase): loader._data_queue = queue.Queue(self.batch_num + 1) exception = None try: - 
loader._reader_process_loop() + _reader_process_loop(loader._batch_reader, loader._data_queue) except core.EnforceNotMet as ex: exception = ex self.assertIsNotNone(exception) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py b/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py index da01be8159a5c5d277a22134eb60ef37ef85fc4f..772dd913e4d20ccf51601ea620822c250cb45320 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py @@ -15,18 +15,26 @@ import unittest import numpy as np import paddle.fluid as fluid +import warnings class TestImperativeNumpyBridge(unittest.TestCase): def test_tensor_from_numpy(self): data_np = np.array([[2, 3, 1]]).astype('float32') with fluid.dygraph.guard(fluid.CPUPlace()): - var = fluid.dygraph.to_variable(data_np, zero_copy=True) - self.assertTrue(np.array_equal(var.numpy(), data_np)) - data_np[0][0] = 4 - self.assertEqual(data_np[0][0], 4) - self.assertEqual(var[0][0].numpy()[0], 4) - self.assertTrue(np.array_equal(var.numpy(), data_np)) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + var = fluid.dygraph.to_variable(data_np, zero_copy=True) + assert "Currently, zero_copy is not supported, and it will be discarded." in str( + w[-1].message) + # Temporarily disable zero_copy + # var = fluid.dygraph.to_variable(data_np, zero_copy=True) + # self.assertTrue(np.array_equal(var.numpy(), data_np)) + # data_np[0][0] = 4 + # self.assertEqual(data_np[0][0], 4) + # self.assertEqual(var[0][0].numpy()[0], 4) + # self.assertTrue(np.array_equal(var.numpy(), data_np)) + var2 = fluid.dygraph.to_variable(data_np, zero_copy=False) self.assertTrue(np.array_equal(var2.numpy(), data_np)) data_np[0][0] = -1 diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py index f7fcc1ff561b90dc1b78a67ffbe7c047ed06d0e9..7e6ca8076de5186def1229b58bd23df73021430e 100644 --- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py @@ -23,7 +23,7 @@ from paddle.static import InputSpec import paddle.fluid as fluid from paddle.fluid.dygraph import Linear from paddle.fluid.dygraph import declarative, ProgramTranslator -from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME +from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME, VARIABLE_FILENAME BATCH_SIZE = 32 BATCH_NUM = 10 @@ -56,6 +56,16 @@ class LinearNet(fluid.dygraph.Layer): return self._linear(x) +class LinearNetWithInputSpec(fluid.dygraph.Layer): + def __init__(self, in_size, out_size): + super(LinearNetWithInputSpec, self).__init__() + self._linear = Linear(in_size, out_size) + + @declarative(input_spec=[InputSpec(shape=[None, 784], dtype='float32')]) + def forward(self, x): + return self._linear(x) + + class LinearNetNotDeclarative(fluid.dygraph.Layer): def __init__(self, in_size, out_size): super(LinearNetNotDeclarative, self).__init__() @@ -65,6 +75,23 @@ class LinearNetNotDeclarative(fluid.dygraph.Layer): return self._linear(x) +class LinerNetWithLabel(paddle.nn.Layer): + def __init__(self, in_size, out_size): + super(LinerNetWithLabel, self).__init__() + self._linear = Linear(in_size, out_size) + + @declarative(input_spec=[ + InputSpec( + shape=[None, 784], dtype='float32', name="image"), InputSpec( + shape=[None, 1], dtype='int64', name="label") + ]) + def forward(self, x, label): + out = self._linear(x) + loss =

fluid.layers.cross_entropy(out, label) + avg_loss = fluid.layers.mean(loss) + return out, avg_loss + + class LinearNetReturnLoss(fluid.dygraph.Layer): def __init__(self, in_size, out_size): super(LinearNetReturnLoss, self).__init__() @@ -78,6 +105,72 @@ class LinearNetReturnLoss(fluid.dygraph.Layer): return z, loss +class LinearNetMultiInput(fluid.dygraph.Layer): + def __init__(self, in_size, out_size): + super(LinearNetMultiInput, self).__init__() + self._linear1 = Linear(in_size, out_size) + self._linear2 = Linear(in_size, out_size) + + @declarative(input_spec=[ + InputSpec( + [None, 8], dtype='float32'), InputSpec( + [None, 8], dtype='float32') + ]) + def forward(self, x, y): + x_out = self._linear1(x) + y_out = self._linear2(y) + loss = fluid.layers.mean(x_out + y_out) + return x_out, y_out, loss + + +class MultiLoadingLinearNet(fluid.dygraph.Layer): + def __init__(self, size, model_path): + super(MultiLoadingLinearNet, self).__init__() + self._linear = Linear(size, size) + self._load_linear1 = fluid.dygraph.jit.load(model_path) + self._load_linear2 = fluid.dygraph.jit.load(model_path) + + @declarative + def forward(self, x): + tmp1 = self._linear(x) + tmp2 = self._load_linear1(tmp1) + tmp3 = self._load_linear2(tmp2) + y = self._linear(tmp3) + return y + + +class LinearNetReturnHidden(fluid.dygraph.Layer): + def __init__(self, in_size, out_size): + super(LinearNetReturnHidden, self).__init__() + self._linear_1 = Linear(in_size, out_size) + self._linear_2 = Linear(in_size, out_size) + + @declarative + def forward(self, x): + y = self._linear_1(x) + z = self._linear_2(y) + loss = fluid.layers.mean(z) + return y, loss + + +class EmptyLayer(paddle.nn.Layer): + def __init__(self): + super(EmptyLayer, self).__init__() + + @paddle.jit.to_static + def forward(self, x): + return x + + +class NoParamLayer(paddle.nn.Layer): + def __init__(self): + super(NoParamLayer, self).__init__() + + @paddle.jit.to_static + def forward(self, x, y): + return x + y + + def train(layer, input_size=784, label_size=1): # create optimizer sgd = fluid.optimizer.SGDOptimizer( @@ -102,6 +195,27 @@ def train(layer, input_size=784, label_size=1): return [img], layer, avg_loss +def train_with_label(layer, input_size=784, label_size=1): + # create optimizer + sgd = fluid.optimizer.SGDOptimizer( + learning_rate=0.01, parameter_list=layer.parameters()) + # create data loader + train_loader = fluid.io.DataLoader.from_generator(capacity=5) + train_loader.set_batch_generator( + random_batch_reader(input_size, label_size)) + # train + for data in train_loader(): + img, label = data + label.stop_gradient = True + + out, avg_loss = layer(img, label) + + avg_loss.backward() + sgd.minimize(avg_loss) + layer.clear_gradients() + return out + + class TestJitSaveLoad(unittest.TestCase): def setUp(self): self.model_path = "model.test_jit_save_load" @@ -159,8 +273,11 @@ class TestJitSaveLoad(unittest.TestCase): train_layer.eval() # construct new model new_layer = LinearNet(784, 1) - model_dict, _ = fluid.dygraph.load_dygraph(self.model_path) - new_layer.set_dict(model_dict) + orig_state_dict = new_layer.state_dict() + load_state_dict, _ = fluid.dygraph.load_dygraph(self.model_path) + for structured_name in orig_state_dict: + self.assertTrue(structured_name in load_state_dict) + new_layer.set_state_dict(load_state_dict) new_layer.eval() # inference & compare x = fluid.dygraph.to_variable( @@ -168,38 +285,20 @@ class TestJitSaveLoad(unittest.TestCase): self.assertTrue( np.array_equal(train_layer(x).numpy(), new_layer(x).numpy())) - def 
test_save_get_program_failed(self): - layer = LinearNetNotDeclarative(784, 1) - example_inputs, layer, _ = train(layer) - with self.assertRaises(RuntimeError): - fluid.dygraph.jit.save( - layer=layer, - model_path=self.model_path, - input_spec=example_inputs) - def test_load_dygraph_no_path(self): model_path = "model.test_jit_save_load.no_path" new_layer = LinearNet(784, 1) with self.assertRaises(ValueError): model_dict, _ = fluid.dygraph.load_dygraph(model_path) - -class LinearNetMultiInput(fluid.dygraph.Layer): - def __init__(self, in_size, out_size): - super(LinearNetMultiInput, self).__init__() - self._linear1 = Linear(in_size, out_size) - # self._linear2 = Linear(in_size, out_size) - - @declarative(input_spec=[ - InputSpec( - [None, 8], dtype='float32'), InputSpec( - [None, 8], dtype='float32') - ]) - def forward(self, x, y): - x_out = self._linear1(x) - y_out = self._linear1(y) - loss = fluid.layers.mean(x_out + y_out) - return x_out, y_out, loss + def test_jit_load_model_incomplete(self): + model_path = "model.test_jit_save_load.remove_variables" + self.train_and_save_model(model_path=model_path) + # remove `__variables__` + var_path = os.path.join(model_path, VARIABLE_FILENAME) + os.remove(var_path) + with self.assertRaises(ValueError): + paddle.jit.load(model_path) class TestSaveLoadWithInputSpec(unittest.TestCase): @@ -345,22 +444,6 @@ class TestJitSaveLoadConfig(unittest.TestCase): np.array_equal(train_layer(x)[0].numpy(), infer_layer(x).numpy())) -class MultiLoadingLinearNet(fluid.dygraph.Layer): - def __init__(self, size, model_path): - super(MultiLoadingLinearNet, self).__init__() - self._linear = Linear(size, size) - self._load_linear1 = fluid.dygraph.jit.load(model_path) - self._load_linear2 = fluid.dygraph.jit.load(model_path) - - @declarative - def forward(self, x): - tmp1 = self._linear(x) - tmp2 = self._load_linear1(tmp1) - tmp3 = self._load_linear2(tmp2) - y = self._linear(tmp3) - return y - - class TestJitMultipleLoading(unittest.TestCase): def setUp(self): self.linear_size = 4 @@ -389,20 +472,6 @@ class TestJitMultipleLoading(unittest.TestCase): name_set.add(var.name) -class LinearNetReturnHidden(fluid.dygraph.Layer): - def __init__(self, in_size, out_size): - super(LinearNetReturnHidden, self).__init__() - self._linear_1 = Linear(in_size, out_size) - self._linear_2 = Linear(in_size, out_size) - - @declarative - def forward(self, x): - y = self._linear_1(x) - z = self._linear_2(y) - loss = fluid.layers.mean(z) - return y, loss - - class TestJitPruneModelAndLoad(unittest.TestCase): def setUp(self): self.linear_size = 4 @@ -461,5 +530,230 @@ class TestJitPruneModelAndLoad(unittest.TestCase): fluid.dygraph.jit.load(self.model_path) +class TestJitSaveMultiCases(unittest.TestCase): + def setUp(self): + # enable dygraph mode + fluid.enable_dygraph() + # config seed + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) + + def verify_inference_correctness(self, layer, model_path, with_label=False): + layer.eval() + loaded_layer = paddle.jit.load(model_path) + loaded_layer.eval() + # inference & compare + x = paddle.to_variable(np.random.random((1, 784)).astype('float32')) + if with_label: + y = paddle.to_variable(np.random.random((1, 1)).astype('int64')) + pred, _ = layer(x, y) + pred = pred.numpy() + else: + pred = layer(x).numpy() + loaded_pred = loaded_layer(x).numpy() + self.assertTrue( + np.array_equal(pred, loaded_pred), + msg="Result diff when load and inference:\nlayer result:\n{}\n" \ + "loaded layer result:\n{}".format(pred, loaded_pred)) 
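The TestJitSaveMultiCases methods that follow exercise two save paths: a layer whose forward is already declarative can be passed to paddle.jit.save directly, while a plain dygraph layer needs an explicit input_spec so a program can be traced. A minimal sketch of the second path, reusing the LinearNetNotDeclarative class defined earlier in this file (the model path is a hypothetical placeholder):

import numpy as np
import paddle
from paddle.static import InputSpec

layer = LinearNetNotDeclarative(784, 1)
paddle.jit.save(
    layer,
    "example_model_path",  # hypothetical location
    input_spec=[InputSpec(shape=[None, 784], dtype='float32')])

# Loading back gives a callable layer whose outputs should match.
layer.eval()
loaded = paddle.jit.load("example_model_path")
loaded.eval()
x = paddle.to_variable(np.random.random((1, 784)).astype('float32'))
np.testing.assert_array_equal(layer(x).numpy(), loaded(x).numpy())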
+ + def test_no_prune_to_static_after_train(self): + layer = LinearNet(784, 1) + + train(layer) + + model_path = "test_no_prune_to_static_after_train" + paddle.jit.save(layer, model_path) + + self.verify_inference_correctness(layer, model_path) + + def test_no_prune_to_static_no_train(self): + layer = LinearNetWithInputSpec(784, 1) + + model_path = "test_no_prune_to_static_no_train" + paddle.jit.save(layer, model_path) + + self.verify_inference_correctness(layer, model_path) + + def test_no_prune_no_to_static_after_train(self): + layer = LinearNetNotDeclarative(784, 1) + + train(layer) + + model_path = "test_no_prune_no_to_static_after_train" + paddle.jit.save( + layer, + model_path, + input_spec=[InputSpec( + shape=[None, 784], dtype='float32')]) + + self.verify_inference_correctness(layer, model_path) + + def test_no_prune_no_to_static_after_train_with_examples(self): + layer = LinearNetNotDeclarative(784, 1) + + example_inputs, _, _ = train(layer) + + model_path = "test_no_prune_no_to_static_after_train_with_examples" + fluid.dygraph.jit.save( + layer=layer, model_path=model_path, input_spec=example_inputs) + + self.verify_inference_correctness(layer, model_path) + + def test_no_prune_no_to_static_no_train(self): + layer = LinearNetNotDeclarative(784, 1) + + model_path = "test_no_prune_no_to_static_no_train" + paddle.jit.save( + layer, + model_path, + input_spec=[InputSpec( + shape=[None, 784], dtype='float32')]) + + self.verify_inference_correctness(layer, model_path) + + def test_prune_to_static_after_train(self): + layer = LinerNetWithLabel(784, 1) + + out = train_with_label(layer) + + model_path = "test_prune_to_static_after_train" + configs = paddle.SaveLoadConfig() + configs.output_spec = [out] + paddle.jit.save( + layer, + model_path, + input_spec=[ + InputSpec( + shape=[None, 784], dtype='float32', name="image") + ], + configs=configs) + + self.verify_inference_correctness(layer, model_path, True) + + def test_prune_to_static_no_train(self): + layer = LinerNetWithLabel(784, 1) + + model_path = "test_prune_to_static_no_train" + configs = paddle.SaveLoadConfig() + # TODO: no train, cannot get output_spec var here + # now only can use index + configs.output_spec = layer.forward.outputs[:1] + paddle.jit.save( + layer, + model_path, + input_spec=[ + InputSpec( + shape=[None, 784], dtype='float32', name="image") + ], + configs=configs) + + self.verify_inference_correctness(layer, model_path, True) + + def test_no_prune_input_spec_name_warning(self): + layer = LinearNetWithInputSpec(784, 1) + + train(layer) + + model_path = "test_no_prune_input_spec_name_warning" + paddle.jit.save( + layer, + model_path, + input_spec=[InputSpec( + shape=[None, 784], dtype='float32')]) + paddle.jit.save( + layer, + model_path, + input_spec=[ + InputSpec( + shape=[None, 784], dtype='float32', name='feed_input') + ]) + + self.verify_inference_correctness(layer, model_path) + + def test_not_prune_output_spec_name_warning(self): + layer = LinearNet(784, 1) + + train(layer) + + model_path = "test_not_prune_output_spec_name_warning" + configs = paddle.SaveLoadConfig() + out = paddle.to_variable(np.random.random((1, 1)).astype('float')) + configs.output_spec = [out] + paddle.jit.save(layer, model_path, configs=configs) + + self.verify_inference_correctness(layer, model_path) + + def test_prune_input_spec_name_error(self): + layer = LinerNetWithLabel(784, 1) + + model_path = "test_prune_input_spec_name_error" + with self.assertRaises(ValueError): + paddle.jit.save( + layer, + model_path, + 
input_spec=[InputSpec( + shape=[None, 784], dtype='float32')]) + with self.assertRaises(ValueError): + paddle.jit.save( + layer, + model_path, + input_spec=[ + InputSpec( + shape=[None, 784], dtype='float32', name='feed_input') + ]) + + def test_prune_output_spec_name_error(self): + layer = LinerNetWithLabel(784, 1) + + train_with_label(layer) + + model_path = "test_prune_to_static_after_train" + configs = paddle.SaveLoadConfig() + out = paddle.to_variable(np.random.random((1, 1)).astype('float')) + configs.output_spec = [out] + with self.assertRaises(ValueError): + paddle.jit.save( + layer, + model_path, + input_spec=[ + InputSpec( + shape=[None, 784], dtype='float32', name="image") + ], + configs=configs) + + +class TestJitSaveLoadEmptyLayer(unittest.TestCase): + def setUp(self): + self.model_path = "model.jit_save_load_empty_layer" + # enable dygraph mode + paddle.disable_static() + + def test_save_load_empty_layer(self): + layer = EmptyLayer() + x = paddle.to_variable(np.random.random((10)).astype('float32')) + out = layer(x) + paddle.jit.save(layer, self.model_path) + load_layer = paddle.jit.load(self.model_path) + load_out = load_layer(x) + self.assertTrue(np.array_equal(out, load_out)) + + +class TestJitSaveLoadNoParamLayer(unittest.TestCase): + def setUp(self): + self.model_path = "model.jit_save_load_no_param_layer" + # enable dygraph mode + paddle.disable_static() + + def test_save_load_no_param_layer(self): + layer = NoParamLayer() + x = paddle.to_variable(np.random.random((5)).astype('float32')) + y = paddle.to_variable(np.random.random((5)).astype('float32')) + out = layer(x, y) + paddle.jit.save(layer, self.model_path) + load_layer = paddle.jit.load(self.model_path) + load_out = load_layer(x, y) + self.assertTrue(np.array_equal(out, load_out)) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_logsumexp.py b/python/paddle/fluid/tests/unittests/test_logsumexp.py index c2201a52605bc87246fb9c8734494b19f83ff180..cf9203dffcbaa5da641b3f7cb8925ac9efcbe115 100644 --- a/python/paddle/fluid/tests/unittests/test_logsumexp.py +++ b/python/paddle/fluid/tests/unittests/test_logsumexp.py @@ -46,8 +46,8 @@ class TestLogsumexp(OpTest): self.inputs = {'X': x} self.outputs = {'Out': out} self.attrs = { - 'dim': self.axis, - 'keep_dim': self.keepdim, + 'axis': self.axis, + 'keepdim': self.keepdim, 'reduce_all': self.reduce_all } diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch.py b/python/paddle/fluid/tests/unittests/test_math_op_patch.py index 00137f63e244a0e166047e89f9ef436da158ed16..f6eff22d6ce5f06d8853d6244f79b4b07b3fa4f5 100644 --- a/python/paddle/fluid/tests/unittests/test_math_op_patch.py +++ b/python/paddle/fluid/tests/unittests/test_math_op_patch.py @@ -189,15 +189,15 @@ class TestMathOpPatches(unittest.TestCase): @prog_scope() def test_integer_div(self): a = fluid.layers.data(name="a", shape=[1], dtype='int64') - b = a / 2 + b = a / 7 place = fluid.CPUPlace() exe = fluid.Executor(place) - a_np = numpy.array([3, 4, 10, 14, 9, 18]) + a_np = numpy.array([3, 4, 10, 14, 9, 18]).astype('int64') b_np, = exe.run(fluid.default_main_program(), feed={"a": a_np}, fetch_list=[b]) - # for paddle2.0, use true_divide - b_np_actual = (a_np / 2.0) + + b_np_actual = (a_np / 7).astype('int64') self.assertTrue(numpy.array_equal(b_np, b_np_actual)) @prog_scope() diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py index 
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py
index 3a8867f6bd29f5bc0e512f9c8b22ecf192253fc7..6fd14b40bc9108b6075a0ac1f40cbefd79b8f0d9 100644
--- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py
@@ -27,6 +27,7 @@ import paddle.fluid.core as core
 from paddle.io import Dataset, IterableDataset, BatchSampler, DataLoader
 from paddle.fluid.dygraph.nn import Linear
 from paddle.fluid.dygraph.base import to_variable
+from paddle.fluid.dataloader.dataloader_iter import _worker_loop
 
 
 class RandomDataset(Dataset):
@@ -185,9 +186,10 @@ class TestDataLoaderWorkerLoop(unittest.TestCase):
             for i in range(10):
                 indices_queue.put([i, i + 10])
             indices_queue.put(None)
-            loader._worker_loop(
-                loader._dataset, 0, indices_queue, loader._data_queue,
-                loader._workers_done_event, _collate_fn, _init_fn, 0, 1)
+            _worker_loop(loader._dataset, 0, indices_queue,
+                         loader._data_queue, loader._workers_done_event,
+                         _collate_fn, _init_fn, 0, 1,
+                         loader._use_shared_memory)
             self.assertTrue(False)
         except AssertionError:
             pass
@@ -228,9 +230,10 @@ class TestDataLoaderWorkerLoop(unittest.TestCase):
                 indices_queue.put([i, i + 10])
             indices_queue.put(None)
             loader._workers_done_event.set()
-            loader._worker_loop(
-                loader._dataset, 0, indices_queue, loader._data_queue,
-                loader._workers_done_event, _collate_fn, _init_fn, 0, 1)
+            _worker_loop(loader._dataset, 0, indices_queue,
+                         loader._data_queue, loader._workers_done_event,
+                         _collate_fn, _init_fn, 0, 1,
+                         loader._use_shared_memory)
             self.assertTrue(True)
         except AssertionError:
             pass
diff --git a/python/paddle/fluid/tests/unittests/test_nn_grad.py b/python/paddle/fluid/tests/unittests/test_nn_grad.py
index 0c39dc5e731d25720149af4480020a7ab3ac5bb9..5d1e016287e07a8505336e6cb447c0e1b29a2ec2 100644
--- a/python/paddle/fluid/tests/unittests/test_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_nn_grad.py
@@ -17,6 +17,7 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 import paddle.fluid.core as core
@@ -153,6 +154,30 @@ class TestMulDoubleGradCheck(unittest.TestCase):
 
 
 class TestReshapeDoubleGradCheck(unittest.TestCase):
+    @prog_scope()
+    def func(self, place):
+        x_shape = [3, 12]
+        expand_times = [4, 9]
+        eps = 0.005
+        dtype = np.float64
+
+        x = layers.data('x', x_shape, False, dtype)
+        x.persistable = True
+        out = layers.expand(x, expand_times)
+        x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype)
+
+        gradient_checker.double_grad_check(
+            [x], out, x_init=x_arr, place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
+class TestExpandDoubleGradCheck(unittest.TestCase):
+    @prog_scope()
+    def func(self, place):
+        x_shape = [3, 12]
@@ -176,5 +201,53 @@ class TestReshapeDoubleGradCheck(unittest.TestCase):
             self.func(p)
 
 
+class TestTileDoubleGradCheck(unittest.TestCase):
+    @prog_scope()
+    def func(self, place):
+        x_shape = [3, 12]
+        repeat_times = [4, 9]
+        eps = 0.005
+        dtype = np.float64
+
+        x = layers.data('x', x_shape, False, dtype)
+        x.persistable = True
+        out = paddle.tile(x, repeat_times)
+        x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype)
+
+        gradient_checker.double_grad_check(
+            [x], out, x_init=x_arr, place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
+class TestExpandV2DoubleGradCheck(unittest.TestCase):
+    @prog_scope()
+    def func(self, place):
+        x_shape = [1, 12]
+        new_shape = [4, 12]
+        eps = 0.005
+        dtype = np.float64
+
+        x = layers.data('x', x_shape, False, dtype)
+        x.persistable = True
+        out = paddle.expand(x, new_shape)
+        x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype)
+
+        gradient_checker.double_grad_check(
+            [x], out, x_init=x_arr, place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
 if __name__ == "__main__":
     unittest.main()
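All four double-grad tests follow one pattern: build a single op, then let gradient_checker.double_grad_check compare analytic second-order gradients against finite differences at the given eps. A rough numpy-only sketch of the finite-difference side of that comparison, simplified to a scalar-valued function (purely illustrative, not the checker's actual code):

    import numpy as np

    def numeric_grad(f, x, eps=0.005):
        # central-difference gradient of a scalar function f at x;
        # applying the same idea to the gradient yields the double grad
        grad = np.zeros_like(x)
        it = np.nditer(x, flags=['multi_index'])
        while not it.finished:
            idx = it.multi_index
            orig = x[idx]
            x[idx] = orig + eps
            f_pos = f(x)
            x[idx] = orig - eps
            f_neg = f(x)
            x[idx] = orig
            grad[idx] = (f_pos - f_neg) / (2.0 * eps)
            it.iternext()
        return grad

    # example: d/dx of sum(tile(x, (4, 9))) is the repeat count, everywhere
    g = numeric_grad(lambda v: np.tile(v, (4, 9)).sum(), np.ones((3, 12)))
    assert np.allclose(g, 36.0)  # 4 * 9 copies of each element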
diff --git a/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py b/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py
index 4b2914c223a08c52444e085f0ef9e41518694593..c1992d0d539a5c6499b9b8d022b88997729ef782 100644
--- a/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py
+++ b/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py
@@ -261,7 +261,13 @@ class TestMultiOptimizersMultiCardsError(unittest.TestCase):
         exe.run(startup_program)
 
         np.random.seed(SEED)
+
+        # NOTE(liym27):
+        # This test needs multiple cards to trigger NotImplementedError.
+        # It was moved out of RUN_TYPE=DIST in tests/unittests/CMakeLists.txt
+        # to run with multiple cards **on CPU only**, not GPU, to reduce CI time.
         os.environ['CPU_NUM'] = str(2)
+
         pe_exe = fluid.ParallelExecutor(
             use_cuda=use_cuda,
             main_program=main_program,
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
index 6671a2def3cccd2acd76025e73486b06b4bb1471..ea59a7f584a2dd5a06d37ede160ace130fc93580 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
@@ -176,7 +176,7 @@ class TestCRFModel(unittest.TestCase):
                 place=fluid.CPUPlace())
 
             data = train_data()
-            for i in range(10):
+            for i in range(4):
                 cur_batch = next(data)
                 print(exe.run(train_cp,
                               feed=feeder.feed(cur_batch),
diff --git a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py
index 7e2ef36c1a7fda5c31049ec9c752c5226bfb89dc..6ca194b2694b6c7537ceb94e11eb1a1a0aeb8d8d 100644
--- a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py
+++ b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py
@@ -248,8 +248,7 @@ class PolicyGradient(object):
             func=reward_func, x=[action, length], out=reward)
         neg_log_prob = layers.cross_entropy(act_prob, action)
         cost = neg_log_prob * reward
-        cost = (layers.reduce_sum(cost) /
-                layers.cast(layers.reduce_sum(length), "float32")
+        cost = (layers.reduce_sum(cost) / layers.reduce_sum(length)
                 ) if length is not None else layers.reduce_mean(cost)
         optimizer = fluid.optimizer.Adam(self.lr)
         optimizer.minimize(cost)
diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
index 6f713172f1b29d0df8eed212ab1b148d00d7d45e..1975e4306026ee459aa585c47afa74fce6a6aeed 100644
--- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
@@ -167,8 +167,9 @@ class API_TestDyUnsqueezeAxisTensor(unittest.TestCase):
         with fluid.dygraph.guard():
             input1 = np.random.random([5, 10]).astype("int32")
             out1 = np.expand_dims(input1, axis=1)
+            out1 = np.expand_dims(out1, axis=2)
             input = fluid.dygraph.to_variable(input1)
-            output = paddle.unsqueeze(input, axis=paddle.to_tensor([1]))
+            output = paddle.unsqueeze(input, axis=paddle.to_tensor([1, 2]))
             out_np = output.numpy()
             self.assertTrue(np.array_equal(out1, out_np))
             self.assertEqual(out1.shape, out_np.shape)
diff --git a/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py b/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb93334415c3046362090a143f6c15069793709a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py
@@ -0,0 +1,250 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid as fluid
+import paddle.fluid.contrib.mixed_precision.amp_nn as amp_nn
+
+
+class TestUpdateLossScalingOp(OpTest):
+    def setUp(self):
+        self.op_type = "update_loss_scaling"
+        self.init()
+        found_inf = np.array([False], dtype=np.bool)
+        x = np.random.random((1024, 1024)).astype(self.dtype)
+
+        self.inputs = {
+            'X': [('x0', x)],
+            'FoundInfinite': found_inf,
+            'PrevLossScaling': self.prev_loss_scaling,
+            'InGoodSteps': self.num_good_steps,
+            'InBadSteps': self.num_bad_steps
+        }
+
+        self.outputs = {
+            'Out': [('out0', np.zeros_like(x))],
+            'LossScaling': self.prev_loss_scaling * self.incr_ratio,
+            'OutGoodSteps': self.zero_steps,
+            'OutBadSteps': self.zero_steps
+        }
+
+    def init(self):
+        self.incr_ratio = 2.0
+        self.decr_ratio = 0.8
+        self.dtype = np.float32
+        self.prev_loss_scaling = np.array([2048]).astype(self.dtype)
+        self.num_good_steps = np.array([999], dtype=np.int32)
+        self.num_bad_steps = np.array([1], dtype=np.int32)
+        self.zero_steps = np.array([0], dtype=np.int32)
+        self.attrs = {
+            'incr_every_n_steps': 1000,
+            'decr_every_n_nan_or_inf': 2,
+            'incr_ratio': self.incr_ratio,
+            'decr_ratio': self.decr_ratio,
+        }
+
+    def test_check_output(self):
+        self.check_output(no_check_set=['Out'])
+
+
+class TestUpdateLossScalingOpBad(TestUpdateLossScalingOp):
+    def setUp(self):
+        self.op_type = "update_loss_scaling"
+        self.init()
+        found_inf = np.array([True], dtype=np.bool)
+        x = np.random.random((1024, 1024)).astype(self.dtype)
+        i = np.random.randint(0, 1024, 1)
+        j = np.random.randint(0, 1024, 1)
+        x[i[0]][j[0]] = np.inf
+
+        self.inputs = {
+            'X': [('x0', x)],
+            'FoundInfinite': found_inf,
+            'PrevLossScaling': self.prev_loss_scaling,
+            'InGoodSteps': self.num_good_steps,
+            'InBadSteps': self.num_bad_steps
+        }
+
+        self.outputs = {
+            'Out': [('out0', np.zeros_like(x))],
+            'LossScaling': self.prev_loss_scaling * self.decr_ratio,
+            'OutGoodSteps': self.zero_steps,
+            'OutBadSteps': self.zero_steps
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestUpdateLossScalingLayer(unittest.TestCase):
+    def loss_scaling_check(self, use_cuda=True, scope=fluid.Scope()):
+        a = fluid.data(name="a", shape=[1024, 1024], dtype='float32')
+        b = fluid.data(name="b", shape=[512, 128], dtype='float32')
+        x = [a, b]
+        found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool')
+        prev_loss_scaling = fluid.data(
+            name="prev_loss_scaling", shape=[1], dtype='float32')
+        num_good_steps = fluid.data(
+            name="num_good_steps", shape=[1], dtype='int32')
+        num_bad_steps = fluid.data(
+            name="num_bad_steps", shape=[1], dtype='int32')
+
+        a_v = np.random.random([1024, 1024]).astype('float32')
+        b_v = np.random.random([512, 128]).astype('float32')
+        found_inf_v = np.array([False]).astype('bool')
+        prev_loss_scaling_v = np.array([2048]).astype('float32')
+        num_good_steps_v = np.array([999], dtype=np.int32)
+        num_bad_steps_v = np.array([1], dtype=np.int32)
+
+        incr_every_n_steps = 1000
+        decr_every_n_nan_or_inf = 2
+        incr_ratio = 2
+        decr_ratio = 0.8
+
+        result = amp_nn.update_loss_scaling(
+            x,
+            found_inf,
+            prev_loss_scaling,
+            num_good_steps,
+            num_bad_steps,
+            incr_every_n_steps,
+            decr_every_n_nan_or_inf,
+            incr_ratio,
+            decr_ratio,
+            name="update_loss_scaling")
+
+        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        with fluid.scope_guard(scope):
+            exe.run(fluid.default_startup_program())
+            result_v = exe.run(feed={
+                'a': a_v,
+                'b': b_v,
+                'found_inf': found_inf_v,
+                'prev_loss_scaling': prev_loss_scaling_v,
+                'num_good_steps': num_good_steps_v,
+                'num_bad_steps': num_bad_steps_v
+            },
+                               fetch_list=[
+                                   result, x, found_inf, prev_loss_scaling,
+                                   num_good_steps, num_bad_steps
+                               ])
+        assert np.array_equal(result_v[0], a_v)
+        assert np.array_equal(result_v[1], b_v)
+        assert np.array_equal(result_v[0], result_v[2])
+        assert np.array_equal(result_v[1], result_v[3])
+        assert np.array_equal(result_v[4], found_inf_v)
+        assert np.array_equal(result_v[5], prev_loss_scaling_v * incr_ratio)
+        assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v))
+        assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v))
+
+    def loss_scaling_check_inf(self, use_cuda=True, scope=fluid.Scope()):
+        a = fluid.data(name="a", shape=[1024, 1024], dtype='float32')
+        b = fluid.data(name="b", shape=[512, 128], dtype='float32')
+        x = [a, b]
+        found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool')
+        prev_loss_scaling = fluid.data(
+            name="prev_loss_scaling", shape=[1], dtype='float32')
+        num_good_steps = fluid.data(
+            name="num_good_steps", shape=[1], dtype='int32')
+        num_bad_steps = fluid.data(
+            name="num_bad_steps", shape=[1], dtype='int32')
+
+        a_v = np.random.random([1024, 1024]).astype('float32')
+        b_v = np.random.random([512, 128]).astype('float32')
+        i = np.random.randint(0, 1024, 1)
+        j = np.random.randint(0, 1024, 1)
+        a_v[i[0]][j[0]] = np.inf
+        found_inf_v = np.array([True]).astype('bool')
+        prev_loss_scaling_v = np.array([2048]).astype('float32')
+        num_good_steps_v = np.array([999], dtype=np.int32)
+        num_bad_steps_v = np.array([1], dtype=np.int32)
+
+        incr_every_n_steps = 1000
+        decr_every_n_nan_or_inf = 2
+        incr_ratio = 2
+        decr_ratio = 0.8
+
+        result = amp_nn.update_loss_scaling(
+            x,
+            found_inf,
+            prev_loss_scaling,
+            num_good_steps,
+            num_bad_steps,
+            incr_every_n_steps,
+            decr_every_n_nan_or_inf,
+            incr_ratio,
+            decr_ratio,
+            name="update_loss_scaling")
+
+        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        with fluid.scope_guard(scope):
+            exe.run(fluid.default_startup_program())
+            result_v = exe.run(feed={
+                'a': a_v,
+                'b': b_v,
+                'found_inf': found_inf_v,
+                'prev_loss_scaling': prev_loss_scaling_v,
+                'num_good_steps': num_good_steps_v,
+                'num_bad_steps': num_bad_steps_v
+            },
+                               fetch_list=[
+                                   result, x, found_inf, prev_loss_scaling,
+                                   num_good_steps, num_bad_steps
+                               ])
+        assert np.array_equal(result_v[0], np.zeros_like(a_v))
+        assert np.array_equal(result_v[1], np.zeros_like(b_v))
+        assert np.array_equal(result_v[2], np.zeros_like(a_v))
+        assert np.array_equal(result_v[3], np.zeros_like(b_v))
+        assert np.array_equal(result_v[4], found_inf_v)
+        assert np.array_equal(result_v[5], prev_loss_scaling_v * decr_ratio)
+        assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v))
+        assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v))
+
+    def test_loss_scaling_cpu(self):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, startup):
+                self.loss_scaling_check(use_cuda=False)
+
+    def test_loss_scaling_cpu_inf(self):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, startup):
+                self.loss_scaling_check_inf(use_cuda=False)
+
+    def test_loss_scaling_gpu(self):
+        if fluid.core.is_compiled_with_cuda():
+            main = fluid.Program()
+            startup = fluid.Program()
+            with fluid.unique_name.guard():
+                with fluid.program_guard(main, startup):
+                    self.loss_scaling_check(use_cuda=True)
+
+    def test_loss_scaling_gpu_inf(self):
+        if fluid.core.is_compiled_with_cuda():
+            main = fluid.Program()
+            startup = fluid.Program()
+            with fluid.unique_name.guard():
+                with fluid.program_guard(main, startup):
+                    self.loss_scaling_check_inf(use_cuda=True)
+
+
+if __name__ == '__main__':
+    unittest.main()
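The expected outputs in the two OpTest fixtures pin down the op's bookkeeping. As a plain-Python summary of the update rule they encode (reconstructed from the fixtures above, not from the kernel itself):

    def update_loss_scaling_ref(found_inf, scaling, good_steps, bad_steps,
                                incr_every_n_steps, decr_every_n_nan_or_inf,
                                incr_ratio, decr_ratio):
        # good case above: 999 good steps + 1 reaches incr_every_n_steps=1000,
        # so scaling grows by incr_ratio and both counters reset to zero;
        # bad case above: 1 bad step + 1 reaches decr_every_n_nan_or_inf=2,
        # so scaling shrinks by decr_ratio and both counters reset to zero.
        if found_inf:
            good_steps, bad_steps = 0, bad_steps + 1
            if bad_steps == decr_every_n_nan_or_inf:
                scaling, bad_steps = scaling * decr_ratio, 0
        else:
            good_steps, bad_steps = good_steps + 1, 0
            if good_steps == incr_every_n_steps:
                scaling, good_steps = scaling * incr_ratio, 0
        return scaling, good_steps, bad_steps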
diff --git a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py
index 0de0eeb464ad700abb2144e49a822582b8653589..afd3414943e9c94799aba5e5e747182623b0a095 100644
--- a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py
+++ b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py
@@ -25,6 +25,7 @@ no_check_set_white_list = [
     'unsqueeze2',
     'cross_entropy2',
     'seed',
-    'amp_check_finite_and_scale',
+    'check_finite_and_unscale',
+    'update_loss_scaling',
     'cudnn_lstm',
 ]
diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py
index 014c778eee98a386e82fffe46dcc932d55aa6574..9f7fb0185133f580deba64634b62d82955670641 100644
--- a/python/paddle/nn/functional/common.py
+++ b/python/paddle/nn/functional/common.py
@@ -450,7 +450,7 @@ def interpolate(x,
             for i in range(len(x.shape) - 2):
                 scale_list.append(scale)
             attrs['scale'] = list(map(float, scale_list))
-        elif isinstance(scale, list) or isinstance(scale, float):
+        elif isinstance(scale, list) or isinstance(scale, tuple):
             if len(scale) != len(x.shape) - 2:
                 raise ValueError("scale_shape length should be {} for "
                                  "input {}-D tensor.".format(
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index 6c139b0ddbbb996145e3a611839bf5e2e113f3cd..da086c0955e849619ccbce17a297ca4615a3f3d0 100644
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -1009,8 +1009,7 @@ def ctc_loss(log_probs,
     loss_out = fluid.layers.squeeze(loss_out, [-1])
     assert reduction in ['mean', 'sum', 'none']
     if reduction == 'mean':
-        loss_out = paddle.mean(loss_out / paddle.cast(label_lengths,
-                                                      loss_out.dtype))
+        loss_out = paddle.mean(loss_out / label_lengths)
     elif reduction == 'sum':
         loss_out = paddle.sum(loss_out)
     return loss_out
diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py
index 3150b8c2d0363274dfb6fd3465110c89339cd4c9..708aaa788f60d56a2adb41c8a571079354b3c192 100644
--- a/python/paddle/optimizer/adam.py
+++ b/python/paddle/optimizer/adam.py
@@ -250,3 +250,47 @@ class Adam(Optimizer):
             stop_gradient=True)
 
         return adam_op
+
+    @framework.dygraph_only
+    def step(self):
+        """
+        Execute the optimizer and update parameters once.
+
+        Returns:
+            None
+
+        Examples:
+            .. code-block:: python
+
+                import paddle
+                import numpy as np
+                paddle.disable_static()
+                value = np.arange(26).reshape(2, 13).astype("float32")
+                a = paddle.to_tensor(value)
+                linear = paddle.nn.Linear(13, 5)
+                # This can be any optimizer supported by dygraph.
+                adam = paddle.optimizer.Adam(learning_rate = 0.01,
+                                             parameters = linear.parameters())
+                out = linear(a)
+                out.backward()
+                adam.step()
+                adam.clear_grad()
+        """
+        parameter_list = self._parameter_list
+        self._dtype = None
+        params_grads = []
+        for param in self._parameter_list:
+            if not param.trainable:
+                continue
+            if hasattr(
+                    param, "_is_sparse"
+            ) and param._is_sparse and self.regularization is not None:
+                raise RuntimeError(
+                    "Adam doesn't support weight_decay with sparse parameters, please set it to None."
+                )
+            if param._grad_ivar() is not None:
+                grad_var = param._grad_ivar()
+                params_grads.append((param, grad_var))
+
+        optimize_ops = self._apply_optimize(
+            loss=None, startup_program=None, params_grads=params_grads)
diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py
index 0fed32a1676759bd94961af0a8949d035ec48c8f..8bb584be2362e7b02bc5b7c5603b148d37499c2d 100755
--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -40,6 +40,7 @@ from .creation import full_like  #DEFINE_ALIAS
 from .creation import triu  #DEFINE_ALIAS
 from .creation import tril  #DEFINE_ALIAS
 from .creation import meshgrid  #DEFINE_ALIAS
+from .creation import empty  #DEFINE_ALIAS
 from .io import save  #DEFINE_ALIAS
 from .io import load  #DEFINE_ALIAS
 from .linalg import matmul  #DEFINE_ALIAS
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index 9eece1240d7d3c0b8a863091367e993047bd4527..8011b92964b7e21fd930f19cec954b27f470e0c6 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -48,6 +48,7 @@ __all__ = [
     'eye',
     'full',
     'full_like',
+    'empty',
     'triu',
     'tril',
     'meshgrid'
@@ -62,8 +63,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True):
 
     If the ``data`` is already a tensor, and ``dtype`` or ``place`` does't change, no copy
     will be performed and return origin tensor, otherwise a new tensor will be constructed
-    and returned. Similarly, if the data is an numpy\.ndarray of with the same ``dtype``
-    and the current place is cpu, no copy will be performed.
+    and returned.
 
     The ``ComplexTensor`` is a unique type of paddle. If x is ``ComplexTensor``, then ``x.real`` is the real part, and ``x.imag`` is the imaginary part.
@@ -208,20 +208,20 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True):
                 value=data,
                 place=place,
                 persistable=False,
-                zero_copy=True,
+                zero_copy=False,
                 stop_gradient=stop_gradient)
         else:
             name = unique_name.generate('generated_tensor')
             real_tensor = paddle.Tensor(
                 value=data.real,
                 place=place,
-                zero_copy=True,
+                zero_copy=False,
                 name=name + ".real",
                 stop_gradient=stop_gradient)
             imag_tensor = paddle.Tensor(
                 value=data.imag,
                 place=place,
-                zero_copy=True,
+                zero_copy=False,
                 name=name + ".imag",
                 stop_gradient=stop_gradient)
             return paddle.ComplexTensor(real_tensor, imag_tensor)
@@ -981,3 +981,90 @@ def diag(x, offset=0, padding_value=0, name=None):
         out.stop_gradient = True
 
     return out
+
+
+def empty(shape, dtype=None, name=None):
+    """
+    This Op returns a Tensor with uninitialized data, whose size is the same as ``shape``.
+
+    Args:
+        shape(list|tuple|Tensor): Shape of the Tensor to be created.
+            The data type of the elements of ``shape`` is ``int32`` or ``int64``. If ``shape`` is a list or tuple,
+            its elements should be integers or Tensors with shape [1].
+            If ``shape`` is a Tensor, it should be a 1-D Tensor.
+        dtype(np.dtype|str, optional): Data type of the output Tensor,
+            which can be bool, float16, float32, float64, int32 or int64. If ``dtype`` is None, the data
+            type of the created Tensor is the global default dtype (see ``get_default_dtype``
+            for details).
+        name(str, optional): The default value is None. Normally there is no need for the user to set this
+            property. For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor: Tensor which is created according to ``shape`` and ``dtype``, and is uninitialized.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()   # Now we are in imperative mode
+          paddle.set_device("cpu")  # and use cpu device
+
+          # example 1: argument ``shape`` is a list which doesn't contain Tensor.
+          data1 = paddle.empty(shape=[2, 3], dtype='float32')
+          #[[4.3612203e+27 1.8176809e+31 1.3555911e-19]     # uninitialized
+          # [1.1699684e-19 1.3563156e-19 3.6408321e-11]]    # uninitialized
+
+          # example 2: argument ``shape`` is a Tensor, the data type must be int64 or int32.
+          shape_data = np.array([2, 3]).astype('int32')
+          shape = paddle.to_tensor(shape_data)
+          data2 = paddle.empty(shape=shape, dtype='float32')
+          #[[1.7192326e-37 4.8125365e-38 1.9866003e-36]     # uninitialized
+          # [1.3284029e-40 7.1117408e-37 2.5353012e+30]]    # uninitialized
+
+          # example 3: argument ``shape`` is a list which contains Tensor.
+          dim2_data = np.array([3]).astype('int32')
+          dim2 = paddle.to_tensor(dim2_data)
+          data3 = paddle.empty(shape=[2, dim2], dtype='float32')
+          #[[1.1024214e+24 7.0379409e+22 6.5737699e-34]     # uninitialized
+          # [7.5563101e+31 7.7130405e+31 2.8020654e+20]]    # uninitialized
+    """
+
+    if dtype is None:
+        dtype = paddle.get_default_dtype()
+
+    dtype = convert_dtype(dtype)
+
+    if in_dygraph_mode():
+        shape = utils.convert_shape_to_list(shape)
+        out = core.ops.empty('shape', shape, 'dtype',
+                             convert_np_dtype_to_dtype_(dtype))
+        out.stop_gradient = True
+        return out
+
+    helper = LayerHelper("empty", **locals())
+    inputs = {}
+
+    check_dtype(dtype, 'dtype',
+                ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
+                'empty')
+    check_type(shape, 'shape', (Variable, list, tuple), 'empty')
+
+    if isinstance(shape, Variable):
+        check_dtype(shape.dtype, 'shape', ['int32', 'int64'], 'empty')
+
+    attrs = {}
+    utils.get_shape_tensor_inputs(
+        inputs=inputs, attrs=attrs, shape=shape, op_type='empty')
+
+    out = helper.create_variable_for_type_inference(dtype=dtype)
+    attrs['dtype'] = convert_np_dtype_to_dtype_(dtype)
+    helper.append_op(
+        type='empty',
+        inputs=inputs,
+        outputs={'Out': [out]},
+        attrs=attrs,
+        stop_gradient=True)
+    out.stop_gradient = True
+    return out
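The docstring only exercises the dygraph branch; the LayerHelper half of the implementation serves static graphs. A minimal static-graph usage sketch (the enable_static/executor boilerplate here is assumed, not shown anywhere in the patch):

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()
    main, startup = fluid.Program(), fluid.Program()
    with fluid.program_guard(main, startup):
        # takes the LayerHelper/append_op branch of empty()
        out = paddle.empty(shape=[2, 3], dtype='float32')

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(startup)
    result, = exe.run(main, fetch_list=[out])
    print(result.shape)  # (2, 3); contents are uninitialized memory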
- "But x is {}, y is {}".format(x.dtype, y.dtype)) - elif x.dtype in _supported_int_dtype_: - x = x.astype(paddle.get_default_dtype()) - y = y.astype(paddle.get_default_dtype()) - - # rule 4: x is Tensor, y is scalar - elif isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor): - if x.dtype in _supported_int_dtype_: - x = x.astype(paddle.get_default_dtype()) - y = paddle.full(shape=[1], dtype=x.dtype, fill_value=y) - - # rule 5: x is scalar, y is Tensor - elif not isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor): - if y.dtype in _supported_int_dtype_: - y = y.astype(paddle.get_default_dtype()) - x = paddle.full(shape=[1], dtype=y.dtype, fill_value=x) - return _elementwise_op_in_dygraph( x, y, axis=axis, act=act, op_name=op_type) - # rule 1 : avoid numpy.ndarray - if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray): - raise TypeError("divide(): arguments must be Tensor or scalar, not numpy.ndarray.") - - # rule 2: both the inputs are not Tensor - elif not isinstance(x, Variable) and not isinstance(y, Variable): - x = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=x) - y = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=y) - - # rule 3: both the inputs are Tensor - elif isinstance(x, Variable) and isinstance(y, Variable): - if y.dtype != x.dtype: - raise TypeError("divide(): argument position 1 and argument position 2 must have the same dtype." - "But x is {}, y is {}".format(x.dtype, y.dtype)) - elif x.dtype in _supported_int_dtype_: - x = paddle.cast(x, paddle.get_default_dtype()) - y = paddle.cast(y, paddle.get_default_dtype()) - - # rule 4: x is Tensor, y is scalar - elif isinstance(x, Variable) and not isinstance(y, Variable): - if x.dtype in _supported_int_dtype_: - x = paddle.cast(x, paddle.get_default_dtype()) - y = paddle.fill_constant(shape=[1], dtype=x.dtype, value=y) - - # rule 5: x is scalar, y is Tensor - elif not isinstance(x, Variable) and isinstance(y, Variable): - if y.dtype in _supported_int_dtype_: - y = paddle.cast(y, paddle.get_default_dtype()) - x = paddle.fill_constant(shape=[1], dtype=y.dtype, value=x) - return _elementwise_op(LayerHelper(op_type, **locals())) @@ -444,55 +383,9 @@ def floor_divide(x, y, name=None): op_type = 'elementwise_floordiv' axis = -1 if in_dygraph_mode(): - # rule 1 : avoid numpy.ndarray - if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray): - raise TypeError("floor_divide(): arguments must be Tensor or scalar, not numpy.ndarray.") - - # rule 2: both the inputs are not Tensor - elif not isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor): - x = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=x) - y = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=y) - - # rule 3: both the inputs are Tensor - elif isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor): - if y.dtype != x.dtype: - raise TypeError("floor_divide(): argument position 1 and argument position 2 must have the same dtype." 
- "But x is {}, y is {}".format(x.dtype, y.dtype)) - - # rule 4: x is Tensor, y is scalar - elif isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor): - y = paddle.full(shape=[1], dtype=x.dtype, fill_value=y) - - # rule 5: x is scalar, y is Tensor - elif not isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor): - x = paddle.full(shape=[1], dtype=y.dtype, fill_value=x) - return _elementwise_op_in_dygraph( x, y, axis=axis, op_name=op_type) - # rule 1 : avoid numpy.ndarray - if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray): - raise TypeError("divide(): arguments must be Tensor or scalar, not numpy.ndarray.") - - # rule 2: both the inputs are not Tensor - elif not isinstance(x, Variable) and not isinstance(y, Variable): - x = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=x) - y = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=y) - - # rule 3: both the inputs are Tensor - elif isinstance(x, Variable) and isinstance(y, Variable): - if y.dtype != x.dtype: - raise TypeError("divide(): argument position 1 and argument position 2 must have the same dtype." - "But x is {}, y is {}".format(x.dtype, y.dtype)) - - # rule 4: x is Tensor, y is scalar - elif isinstance(x, Variable) and not isinstance(y, Variable): - y = paddle.fill_constant(shape=[1], dtype=x.dtype, value=y) - - # rule 5: x is scalar, y is Tensor - elif not isinstance(x, Variable) and isinstance(y, Variable): - x = paddle.fill_constant(shape=[1], dtype=y.dtype, value=x) - return _elementwise_op(LayerHelper(op_type, **locals())) @@ -531,43 +424,9 @@ def remainder(x, y, name=None): op_type = 'elementwise_mod' axis = -1 if in_dygraph_mode(): - # rule 1 : avoid numpy.ndarray - if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray): - raise TypeError("remainder(): arguments must be Tensor or scalar, not numpy.ndarray.") - - elif not isinstance(x, paddle.Tensor): - raise TypeError("remainder(): arguments position 1 must be Tensor, not {}".format(type(x))) - - # rule 3: both the inputs are Tensor - elif isinstance(y, paddle.Tensor): - if y.dtype != x.dtype: - raise TypeError("remainder(): argument position 1 and argument position 2 must have the same dtype." - "But x is {}, y is {}".format(x.dtype, y.dtype)) - - # rule 4: x is Tensor, y is scalar - elif not isinstance(y, paddle.Tensor): - y = paddle.full(shape=[1], dtype=x.dtype, fill_value=y) - return _elementwise_op_in_dygraph( x, y, axis=axis, op_name=op_type) - # rule 1 : avoid numpy.ndarray - if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray): - raise TypeError("remainder(): arguments must be Tensor or scalar, not numpy.ndarray.") - - elif not isinstance(x, Variable): - raise TypeError("remainder(): arguments position 1 must be Tensor, not {}".format(type(x))) - - # rule 3: both the inputs are Tensor - elif isinstance(y, Variable): - if y.dtype != x.dtype: - raise TypeError("remainder(): argument position 1 and argument position 2 must have the same dtype." 
- "But x is {}, y is {}".format(x.dtype, y.dtype)) - - # rule 4: x is Tensor, y is scalar - elif not isinstance(y, paddle.Tensor): - y = paddle.fill_constant(shape=[1], dtype=x.dtype, value=y) - return _elementwise_op(LayerHelper(op_type, **locals())) @@ -1194,15 +1053,14 @@ def logsumexp(x, axis=None, keepdim=False, name=None): axis = [0] if in_dygraph_mode(): - return core.ops.logsumexp(x, 'dim', axis, 'keep_dim', keepdim, - 'reduce_all', reduce_all) + return core.ops.logsumexp(x, 'axis', axis, 'keepdim', keepdim, 'reduce_all', reduce_all) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'logsumexp') helper = LayerHelper('logsumexp', **locals()) - attrs = {'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all} + attrs = {'axis': axis, 'keepdim': keepdim, 'reduce_all':reduce_all} out = helper.create_variable_for_type_inference(x.dtype) helper.append_op( type='logsumexp', inputs={'X': x}, outputs={'Out': out}, attrs=attrs) diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index 1e5179d0282d7f35c4232d9b9783cb831e83f462..84254cc89bb8eef12a95189416cd29cce828f5ca 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -287,12 +287,19 @@ fi pip install PyGithub # For getting PR related data wget https://paddle-ci.gz.bcebos.com/blk/block.txt --no-check-certificate +wget https://sys-p0.bj.bcebos.com/bk-ci/bk.txt --no-check-certificate HASUTFIXED=`python ${PADDLE_ROOT}/tools/check_ut.py | grep "has unit-test to be fixed" || true` if [ "${HASUTFIXED}" != "" ]; then echo_line="${HASUTFIXED} You must have one RD (chalsliu (Recommend) or kolinwei) approval.\n" check_approval 1 45041955 22165420 fi +HASUTFIXED=`python ${PADDLE_ROOT}/tools/check_ut.py | grep "has benchmark issue to be fixed" || true` +if [ "${HASUTFIXED}" != "" ]; then + echo_line="${HASUTFIXED} You must have one RD (hysunflower or xiegegege or Xreki) approval.\n" + check_approval 1 52739577 46314656 12538138 +fi + if [ -n "${echo_list}" ];then echo "****************" echo -e "${echo_list[@]}" diff --git a/tools/check_ut.py b/tools/check_ut.py index 7b5e5a4f1c55574edc3f28dac76ebf1d932748d7..f5fe4c687dd7828f001ddbab744d66931e37f532 100644 --- a/tools/check_ut.py +++ b/tools/check_ut.py @@ -27,9 +27,12 @@ class PRChecker(object): self.github = Github(os.getenv('GITHUB_API_TOKEN'), timeout=60) self.repo = None - def check(self): - """ check pr. """ - filename = 'block.txt' + def check(self, filename, msg): + """ + Args: + filename (str): File to get block names. + msg (str): Error message. 
+ """ pr_id = os.getenv('GIT_PR_ID') if not pr_id: print('No PR ID') @@ -44,12 +47,10 @@ class PRChecker(object): with open(filename) as f: for l in f: if l.rstrip('\r\n') == user: - print('{} has unit-test to be fixed, so CI failed.'.format( - user)) - exit(1) - exit(0) + print('{} {}'.format(user, msg)) if __name__ == '__main__': pr_checker = PRChecker() - pr_checker.check() + pr_checker.check('block.txt', 'has unit-test to be fixed, so CI failed.') + pr_checker.check('bk.txt', 'has benchmark issue to be fixed, so CI failed.') diff --git a/tools/enforce/count_all_enforce.sh b/tools/enforce/count_all_enforce.sh index c1b7508de0361b7a9036557f88fd0b10f326dcc6..683b73614d29bb42871c63dc94d365626d0375ad 100644 --- a/tools/enforce/count_all_enforce.sh +++ b/tools/enforce/count_all_enforce.sh @@ -39,7 +39,7 @@ # Valid PADDLE_ENFORCE{_**} & PADDLE_THROW Count: 1706 # Invalid PADDLE_ENFORCE{_**} & PADDLE_THROW Count: 4572 -ROOT_DIR=../paddle/fluid +ROOT_DIR=../../paddle/fluid ALL_PADDLE_CHECK_CNT=0 VALID_PADDLE_CHECK_CNT=0 diff --git a/tools/enforce/count_enforce_by_dir.sh b/tools/enforce/count_enforce_by_dir.sh index 03233d417ac88eef775e1ca6a77d0600a4faa361..3cb13edf7cc27f6a0de45080a7b90e4b4e24b6b5 100644 --- a/tools/enforce/count_enforce_by_dir.sh +++ b/tools/enforce/count_enforce_by_dir.sh @@ -59,7 +59,7 @@ . ./count_all_enforce.sh --source-only -ROOT_DIR=../paddle/fluid +ROOT_DIR=../../paddle/fluid function count_dir_independently(){ local sub_dir_total_check_cnt=0