Commit 83c2d35a authored by seiriosPlus

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into optimize/large_scale_kv_spped

@@ -16,7 +16,7 @@ else()
   set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
   set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70")
   set(paddle_known_gpu_archs10 "30 35 50 52 60 61 70 75")
-  set(paddle_known_gpu_archs11 "35 50 52 60 61 70 75 80")
+  set(paddle_known_gpu_archs11 "52 60 61 70 75 80")
 endif()
 ######################################################################################
...
@@ -19,7 +19,7 @@ SET(DGC_SOURCES_DIR "${THIRD_PARTY_PATH}/dgc/src/extern_dgc")
 SET(DGC_INSTALL_DIR "${THIRD_PARTY_PATH}/install/dgc")
 SET(DGC_INCLUDE_DIR "${DGC_INSTALL_DIR}/include" CACHE PATH "dgc include directory." FORCE)
 SET(DGC_LIBRARIES "${DGC_INSTALL_DIR}/lib/libdgc.a" CACHE FILEPATH "dgc library." FORCE)
-SET(DGC_URL "http://fleet.bj.bcebos.com/collective_ef2216a.tgz")
+SET(DGC_URL "https://fleet.bj.bcebos.com/dgc/collective_f66ef73.tgz")
 INCLUDE_DIRECTORIES(${DGC_INCLUDE_DIR})
 cache_third_party(extern_dgc
@@ -30,7 +30,7 @@ ExternalProject_Add(
     extern_dgc
     ${EXTERNAL_PROJECT_LOG_ARGS}
     "${DGC_DOWNLOAD_CMD}"
-    URL_MD5 "2f67549fd5f1262383d83289abc4f88f"
+    URL_MD5 "94e6fa1bc97169d0e1aad44570fe3251"
     PREFIX "${DGC_PREFIX_DIR}"
     SOURCE_DIR "${DGC_SOURCES_DIR}"
     CONFIGURE_COMMAND ""
...
@@ -34,7 +34,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
     set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite)
     if(NOT LITE_GIT_TAG)
-        set(LITE_GIT_TAG dfdfa6440c83bf0b415f9f5a9ff84842ce0bb0fa)
+        set(LITE_GIT_TAG 6d2b2a4028a58715b01887b04eb9bff8432eb184)
     endif()
     if(NOT CUDA_ARCH_NAME)
...
@@ -19,8 +19,8 @@ SET(MKLDNN_PREFIX_DIR ${THIRD_PARTY_PATH}/mkldnn)
 SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn)
 SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn)
 SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
-SET(MKLDNN_REPOSITORY https://github.com/intel/mkl-dnn.git)
-SET(MKLDNN_TAG 1ea812f4f5aa1bd989372a23ab50d0f0f81ee677)
+SET(MKLDNN_REPOSITORY https://github.com/oneapi-src/oneDNN.git)
+SET(MKLDNN_TAG 64a48f9565aa72f6359917b3406328075a409939)
 # Introduce variables:
 # * CMAKE_INSTALL_LIBDIR
...
@@ -18,7 +18,7 @@ SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc)
 SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
 SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
 set(WARPCTC_REPOSITORY https://github.com/baidu-research/warp-ctc.git)
-set(WARPCTC_TAG bc29dcfff07ced1c7a19a4ecee48e5ad583cef8e)
+set(WARPCTC_TAG fc7f226b93758216a03b1be9d24593a12819b984)
 SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
     CACHE PATH "Warp-ctc Directory" FORCE)
...
@@ -28,7 +28,15 @@ function(CheckCompilerCXX11Flag)
 endfunction()
 CheckCompilerCXX11Flag()
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+if (WITH_GPU)
+  if (${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.0)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
+  else()
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+  endif()
+else()
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+endif()
 # safe_set_flag
 #
 # Set a compile flag only if compiler is support
...
@@ -243,9 +243,10 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
 ENDIF()
 if(WITH_GPU)
-    include(external/cub)  # download cub
-    list(APPEND third_party_deps extern_cub)
+    if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
+        include(external/cub)  # download cub
+        list(APPEND third_party_deps extern_cub)
+    endif()
     set(CUDAERROR_URL "http://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "" FORCE)
     file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") # download file cudaErrorMessage
 endif(WITH_GPU)
...
@@ -49,7 +49,8 @@ std::vector<std::string> PD_GetGradOpDescStrs(
     for (size_t i = 0; i < op_num; ++i) {
       PADDLE_ENFORCE_EQ(
           grad_op_descs[i]->Proto()->SerializePartialToString(&ret[i]), true,
-          "Cannot serialize message.");
+          paddle::platform::errors::Unavailable(
+              "Cannot serialize operator desc message."));
     }
   }
   return ret;
...
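Note: most of the C++ changes in this merge follow a single pattern: bare PADDLE_ENFORCE(...) and PADDLE_THROW("...") calls are rewritten to the typed macros (PADDLE_ENFORCE_EQ / _NE / _GT / _NOT_NULL / _CUDA_SUCCESS) with an explicit platform::errors::* category. The sketch below only illustrates that style; it is not code from this commit, the function and its arguments are hypothetical, and it assumes the usual paddle/fluid/platform/enforce.h header.

#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace framework {

// Hypothetical helper, shown only to illustrate the enforcement style used
// throughout this merge: typed comparison macros plus an error category,
// instead of PADDLE_ENFORCE(pool != nullptr, "...").
void CheckNumThreads(int num_threads, const void* pool) {
  PADDLE_ENFORCE_NOT_NULL(
      pool, platform::errors::InvalidArgument(
                "The thread pool pointer should not be nullptr."));
  PADDLE_ENFORCE_GT(num_threads, 0,
                    platform::errors::InvalidArgument(
                        "The number of threads must be positive, but got %d.",
                        num_threads));
}

}  // namespace framework
}  // namespace paddle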
@@ -36,7 +36,10 @@ message AMPConfig {
   repeated string custom_black_varnames = 9;
 }
-message LocalSGDConfig { optional int32 k_steps = 1 [ default = 4 ]; }
+message LocalSGDConfig {
+  optional int32 k_steps = 1 [ default = 1 ];
+  optional int32 begin_step = 2 [ default = 1 ];
+}
 message GradientMergeConfig {
   optional int32 k_steps = 1 [ default = 1 ];
@@ -52,6 +55,8 @@ message DGCConfig {
 message LarsConfig {
   optional float lars_coeff = 1 [ default = 0.001 ];
   optional float lars_weight_decay = 2 [ default = 0.0005 ];
+  optional float epsilon = 3 [ default = 0.0 ];
+  repeated string exclude_from_weight_decay = 4;
 }
 message LambConfig {
...
@@ -25,7 +25,7 @@ bool NCCLWrapper::is_initialized_ = false;
 void NCCLWrapper::InitNCCL() {
 #if defined(PADDLE_WITH_NCCL)
-  PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
+  PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitRank(
       &(nccl_info_.comm_), nccl_info_.global_ranks_, nccl_info_.nccl_id_,
       nccl_info_.my_global_rank_));
 #endif
@@ -41,7 +41,8 @@ void NCCLWrapper::SetNCCLId(const NCCLInfo& nccl_info) {
 NCCLInfo NCCLWrapper::GetNCCLId() {
 #if defined(PADDLE_WITH_NCCL)
-  PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_)));
+  PADDLE_ENFORCE_CUDA_SUCCESS(
+      platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_)));
 #endif
   return nccl_info_;
 }
@@ -52,8 +53,8 @@ void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank,
   nccl_info_.local_rank_ = local_rank;
   nccl_info_.my_global_rank_ = global_rank;
   nccl_info_.global_ranks_ = ranks;
-  PADDLE_ENFORCE(cudaSetDevice(local_rank));
-  PADDLE_ENFORCE(cudaStreamCreate(&(nccl_info_.stream_)));
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(local_rank));
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&(nccl_info_.stream_)));
 #endif
   return;
 }
@@ -65,7 +66,7 @@ void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope,
     auto var = scope.FindVar(name);
     LoDTensor* tensor = var->GetMutable<LoDTensor>();
     int32_t total_size = tensor->numel();
-    PADDLE_ENFORCE(platform::dynload::ncclBcast(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
         reinterpret_cast<void*>(tensor->data<float>()), total_size, ncclFloat,
         root_rank, nccl_info_.comm_, nccl_info_.stream_));
     cudaStreamSynchronize(nccl_info_.stream_);
...
@@ -42,7 +42,8 @@ void ThreadPool::Init() {
       num_threads = FLAGS_dist_threadpool_size;
       VLOG(1) << "set dist_threadpool_size to " << num_threads;
     }
-    PADDLE_ENFORCE_GT(num_threads, 0);
+    PADDLE_ENFORCE_GT(num_threads, 0, platform::errors::InvalidArgument(
+                                          "The number of threads is 0."));
     threadpool_.reset(new ThreadPool(num_threads));
   }
 }
@@ -83,7 +84,8 @@ void ThreadPool::TaskLoop() {
     }
     if (tasks_.empty()) {
-      PADDLE_THROW("This thread has no task to Run");
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Current thread has no task to Run."));
     }
     // pop a task from the task queue
...
@@ -91,7 +91,8 @@ class ThreadPool {
     {
       std::unique_lock<std::mutex> lock(mutex_);
       if (!running_) {
-        PADDLE_THROW("enqueue on stopped ThreadPool");
+        PADDLE_THROW(platform::errors::Unavailable(
+            "Task is enqueued into stopped ThreadPool."));
       }
       tasks_.push(std::move(task));
     }
...
@@ -43,8 +43,9 @@ void VarDesc::SetTensorDescNum(size_t num) {
     } break;
     default:
       PADDLE_THROW(
-          "Setting 'sub_tensor_number' is not supported by the type of var %s.",
-          this->Name());
+          platform::errors::Unavailable("Setting 'sub_tensor_number' is not "
+                                        "supported by the %s type variable.",
+                                        this->Name()));
   }
 }
@@ -55,8 +56,9 @@ size_t VarDesc::GetTensorDescNum() const {
       break;
     default:
       PADDLE_THROW(
-          "Getting 'sub_tensor_number' is not supported by the type of var %s.",
-          this->Name());
+          platform::errors::Unavailable("Getting 'sub_tensor_number' is not "
+                                        "supported by the %s type variable.",
+                                        this->Name()));
   }
 }
@@ -133,9 +135,9 @@ void VarDesc::SetLoDLevel(int32_t lod_level) {
       desc_.mutable_type()->mutable_tensor_array()->set_lod_level(lod_level);
       break;
     default:
-      PADDLE_THROW(
-          "Setting 'lod_level' is not supported by the type of var %s.",
-          this->Name());
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Setting 'lod_level' is not supported by the %s type variable.",
+          this->Name()));
   }
 }
@@ -157,9 +159,9 @@ void VarDesc::SetLoDLevels(const std::vector<int32_t> &multiple_lod_level) {
       }
     } break;
     default:
-      PADDLE_THROW(
-          "Setting 'lod_levels' is not supported by the type of var %s.",
-          this->Name());
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Setting 'lod_levels' is not supported by the %s type variable",
+          this->Name()));
   }
 }
@@ -170,9 +172,9 @@ int32_t VarDesc::GetLoDLevel() const {
     case proto::VarType::LOD_TENSOR_ARRAY:
       return desc_.type().tensor_array().lod_level();
     default:
-      PADDLE_THROW(
-          "Getting 'lod_level' is not supported by the type of var %s.",
-          this->Name());
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Getting 'lod_level' is not supported by the %s type variable.",
+          this->Name()));
   }
 }
@@ -187,15 +189,19 @@ std::vector<int32_t> VarDesc::GetLoDLevels() const {
       return res;
       break;
     default:
-      PADDLE_THROW(
-          "Getting 'lod_levels' is not supported by the type of var %s.",
-          this->Name());
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Getting 'lod_levels' is not supported by the %s type variable.",
+          this->Name()));
   }
 }
 const proto::VarType::TensorDesc &VarDesc::tensor_desc() const {
-  PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set.");
-  PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
+  PADDLE_ENFORCE_EQ(
+      desc_.has_type(), true,
+      platform::errors::NotFound("The variable's type was not be set."));
+  PADDLE_ENFORCE_EQ(
+      desc_.type().has_type(), true,
+      platform::errors::NotFound("The variable's type was not be set."));
   switch (desc_.type().type()) {
     case proto::VarType::SELECTED_ROWS:
       return desc_.type().selected_rows();
@@ -204,14 +210,16 @@ const proto::VarType::TensorDesc &VarDesc::tensor_desc() const {
     case proto::VarType::LOD_TENSOR_ARRAY:
       return desc_.type().tensor_array().tensor();
     default:
-      PADDLE_THROW(
-          "Getting 'tensor_desc' is not supported by the type of var %s.",
-          this->Name());
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Getting 'tensor_desc' is not supported by the %s type variable.",
+          this->Name()));
   }
 }
 std::vector<proto::VarType::TensorDesc> VarDesc::tensor_descs() const {
-  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
+  PADDLE_ENFORCE_EQ(
+      desc_.has_type(), true,
+      platform::errors::NotFound("The variable's type was not be set."));
   std::vector<proto::VarType::TensorDesc> res;
   res.reserve(GetTensorDescNum());
   switch (desc_.type().type()) {
@@ -221,16 +229,19 @@ std::vector<proto::VarType::TensorDesc> VarDesc::tensor_descs() const {
       }
       return res;
     default:
-      PADDLE_THROW(
-          "Getting 'tensor_descs' is not supported by the type of var "
-          "%s.",
-          this->Name());
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Getting 'tensor_descs' is not supported by the %s type variable.",
+          this->Name()));
   }
 }
 proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() {
-  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
-  PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
+  PADDLE_ENFORCE_EQ(
+      desc_.has_type(), true,
+      platform::errors::NotFound("The variable's type was not be set."));
+  PADDLE_ENFORCE_EQ(
+      desc_.type().has_type(), true,
+      platform::errors::NotFound("The variable's type was not be set."));
   switch (desc_.type().type()) {
     case proto::VarType::SELECTED_ROWS:
       return desc_.mutable_type()->mutable_selected_rows();
@@ -240,15 +251,19 @@ proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() {
       return desc_.mutable_type()->mutable_tensor_array()->mutable_tensor();
     default:
       PADDLE_THROW(
-          "Getting 'mutable_tensor_desc' is not supported by the type of var "
-          "%s.",
-          this->Name());
+          platform::errors::Unavailable("Getting 'mutable_tensor_desc' is not "
+                                        "supported by the %s type variable.",
+                                        this->Name()));
   }
 }
 std::vector<proto::VarType::TensorDesc *> VarDesc::mutable_tensor_descs() {
-  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
-  PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
+  PADDLE_ENFORCE_EQ(
+      desc_.has_type(), true,
+      platform::errors::NotFound("The variable's type was not be set."));
+  PADDLE_ENFORCE_EQ(
+      desc_.type().has_type(), true,
+      platform::errors::NotFound("The variable's type was not be set."));
   std::vector<proto::VarType::TensorDesc *> res;
   res.reserve(GetTensorDescNum());
   switch (desc_.type().type()) {
@@ -259,10 +274,9 @@ std::vector<proto::VarType::TensorDesc *> VarDesc::mutable_tensor_descs() {
     }
       return res;
     default:
-      PADDLE_THROW(
-          "Getting 'tensor_descs' is not supported by the type of var "
-          "%s.",
-          this->Name());
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Getting 'tensor_descs' is not supported by the %s type variable.",
+          this->Name()));
   }
 }
...
@@ -40,7 +40,8 @@ inline proto::VarType::Type ToVarType(int type) {
     case proto::VarType::READER:
      return static_cast<proto::VarType::Type>(type);
     default:
-      PADDLE_THROW("ToVarType:Unsupported type %d", type);
+      PADDLE_THROW(platform::errors::Unavailable(
+          "ToVarType method Unsupported type %d.", type));
   }
 }
@@ -66,7 +67,8 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
       visitor(var.Get<FetchList>());
       return;
     default:
-      PADDLE_THROW("Not supported visit type, %s", ToTypeName(var.Type()));
+      PADDLE_THROW(platform::errors::Unavailable("Not supported visit type %s.",
+                                                 ToTypeName(var.Type())));
   }
 }
...
@@ -46,12 +46,14 @@ struct VarIdToTypeIndexMapInitializerImpl {
     static_assert(!std::is_same<Type, void>::value, "Type cannot be void");
     constexpr int kId = VarTypeTrait<Type>::kId;
     auto type = std::type_index(typeid(Type));
-    PADDLE_ENFORCE(id_to_type->count(kId) == 0,
-                   "Registered duplicate type id %d for type %s", kId,
-                   type.name());
-    PADDLE_ENFORCE(type_to_id->count(type) == 0,
-                   "Registered duplicate type_index %s for id %d", type.name(),
-                   kId);
+    PADDLE_ENFORCE_EQ(
+        id_to_type->count(kId), 0,
+        platform::errors::AlreadyExists(
+            "Registered duplicate type id %d for type %s.", kId, type.name()));
+    PADDLE_ENFORCE_EQ(
+        type_to_id->count(type), 0,
+        platform::errors::AlreadyExists(
+            "Registered duplicate type index %s for id %d.", type.name(), kId));
     id_to_type->emplace(kId, type);
     type_to_id->emplace(type, kId);
     VarIdToTypeIndexMapInitializerImpl<kStart + 1, kEnd,
@@ -79,15 +81,17 @@ struct VarIdToTypeIndexMapHolder {
  public:
   static const std::type_index &ToTypeIndex(int var_id) {
     auto it = Instance().id_to_type_map_.find(var_id);
-    PADDLE_ENFORCE(it != Instance().id_to_type_map_.end(),
-                   "VarId %d is not registered.", var_id);
+    PADDLE_ENFORCE_NE(it, Instance().id_to_type_map_.end(),
+                      platform::errors::NotFound(
+                          "Variable Id %d is not registered.", var_id));
     return it->second;
   }
   static int ToTypeId(const std::type_index &type) {
     auto it = Instance().type_to_id_map_.find(type);
-    PADDLE_ENFORCE(it != Instance().type_to_id_map_.end(),
-                   "VarType %s is not registered.", type.name());
+    PADDLE_ENFORCE_NE(it, Instance().type_to_id_map_.end(),
+                      platform::errors::NotFound(
+                          "Variable Type %s is not registered.", type.name()));
     return it->second;
   }
...
@@ -50,11 +50,11 @@ void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
   } else if (var_type == proto::VarType::RAW) {
     // GetMutable will be called in operator
   } else {
-    PADDLE_THROW(
+    PADDLE_THROW(platform::errors::Unavailable(
         "Variable type %d is not in "
         "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
-        "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]",
-        var_type);
+        "LOD_RANK_TABLE, PLACE_LIST, READER, RAW].",
+        var_type));
   }
 }
@@ -76,7 +76,8 @@ void CopyVariable(const Variable &src_var, Variable *dst_var) {
     auto *dst_t = tmp_grad_slr->mutable_value();
     framework::TensorCopy(src_t, cpu_place, dst_t);
   } else {
-    PADDLE_THROW("unknown var type to copy");
+    PADDLE_THROW(
+        platform::errors::Unavailable("Unknown variable type to copy."));
   }
 }
...
@@ -218,6 +218,10 @@ struct Argument {
   DECL_ARGUMENT_FIELD(fusion_statis, FusionStatis, fusion_statis_t);
+  // Only used in paddle-lite subgraph.
+  DECL_ARGUMENT_FIELD(cpu_math_library_num_threads, CpuMathLibraryNumThreads,
+                      int);
  private:
   std::unordered_set<std::string> valid_fields_;
 };
...
@@ -150,6 +150,8 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("use_xpu", new bool(argument->use_xpu()));
       pass->Set("xpu_l3_workspace_size",
                 new int(argument->xpu_l3_workspace_size()));
+      pass->Set("cpu_math_library_num_threads",
+                new int(argument->cpu_math_library_num_threads()));
     }
     disable_logs_ = argument->disable_logs();
     if (pass_name == "fc_fuse_pass") {
...
@@ -244,6 +244,7 @@ void LiteSubgraphPass::SetUpEngine(
   bool enable_int8 = Get<bool>("enable_int8");
   bool use_xpu = Get<bool>("use_xpu");
   int xpu_l3_workspace_size = Get<int>("xpu_l3_workspace_size");
+  int cpu_math_library_num_threads = Get<int>("cpu_math_library_num_threads");
   lite_api::TargetType target_type;
   if (use_gpu) {
@@ -263,11 +264,12 @@ void LiteSubgraphPass::SetUpEngine(
       // Notice: The ordering here determines the device where the
       // input tensor of the Lite engine is located, and then affects
       // whether tensor sharing is feasible.
-      paddle::lite::Place({target_type, precision_type}),
-      paddle::lite::Place({target_type, PRECISION(kInt64)}),
-      paddle::lite::Place({target_type, PRECISION(kFloat)}),
-      paddle::lite::Place({TARGET(kHost), PRECISION(kFloat)}),
+      paddle::lite_api::Place({target_type, precision_type}),
+      paddle::lite_api::Place({target_type, PRECISION(kInt64)}),
+      paddle::lite_api::Place({target_type, PRECISION(kFloat)}),
+      paddle::lite_api::Place({TARGET(kHost), PRECISION(kFloat)}),
   };
+  config.cpu_math_library_num_threads = cpu_math_library_num_threads;
   config.xpu_l3_workspace_size = xpu_l3_workspace_size;
   if (dump_model) {
     lite::StrToBinaryFile("./model.bin", config.model);
...
@@ -53,12 +53,10 @@ if(WITH_TESTING)
     inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS paddle_fluid_shared
       ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
     set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
-    set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST")
   elseif(WIN32)
     inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps}
       ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
     set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
-    set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST")
   endif()
 endif()
...
@@ -461,6 +461,8 @@ void AnalysisPredictor::PrepareArgument() {
   }
   if (config_.lite_engine_enabled()) {
+    argument_.SetCpuMathLibraryNumThreads(
+        config_.cpu_math_library_num_threads());
     argument_.SetLitePrecisionMode(config_.lite_precision_mode_);
     argument_.SetLitePassesFilter(config_.lite_passes_filter_);
     argument_.SetLiteOpsFilter(config_.lite_ops_filter_);
...
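Note: the hunks above (Argument, IRPassManager, LiteSubgraphPass, AnalysisPredictor) plumb a cpu_math_library_num_threads value from the user-facing AnalysisConfig into the Lite subgraph engine config. A hedged sketch of the user-side setup follows; the model path and thread count are placeholders, and the no-argument EnableLiteEngine() call relies on that method's default parameters.

#include "paddle/fluid/inference/api/paddle_analysis_config.h"

// Hypothetical helper: builds a config whose math-library thread count is now
// forwarded to the Lite subgraph engine by the changes above.
paddle::AnalysisConfig MakeLiteConfig() {
  paddle::AnalysisConfig config;
  config.SetModel("/path/to/inference_model");  // placeholder directory
  config.SetCpuMathLibraryNumThreads(4);        // placeholder thread count
  config.EnableLiteEngine();                    // Lite subgraph engine, default arguments
  return config;
}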
@@ -21,15 +21,21 @@
 namespace paddle {
 void ZeroCopyTensor::Reshape(const std::vector<int> &shape) {
-  PADDLE_ENFORCE(!name_.empty(),
-                 "Need to SetName first, so that the corresponding tensor can "
-                 "be retrieved.");
-  PADDLE_ENFORCE(input_or_output_,
-                 "Can't reshape the output tensor, it is readonly");
-  PADDLE_ENFORCE(scope_);
+  PADDLE_ENFORCE_EQ(
+      name_.empty(), false,
+      platform::errors::PreconditionNotMet(
+          "Need to SetName first, so that the corresponding tensor can "
+          "be retrieved."));
+  PADDLE_ENFORCE_EQ(input_or_output_, true,
+                    platform::errors::PermissionDenied(
+                        "Can't reshape the output tensor, it is readonly"));
+  PADDLE_ENFORCE_NOT_NULL(scope_, platform::errors::PreconditionNotMet(
+                                      "The scope should not be nullptr."));
   auto *scope = static_cast<framework::Scope *>(scope_);
   auto *var = scope->FindVar(name_);
-  PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
+  PADDLE_ENFORCE_NOT_NULL(
+      var, platform::errors::PreconditionNotMet(
+               "No tensor called [%s] in the runtime scope", name_));
   auto *tensor = var->GetMutable<framework::LoDTensor>();
   tensor->Resize(framework::make_ddim(shape));
 }
@@ -45,8 +51,10 @@ T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
   EAGER_GET_TENSOR;
   PADDLE_ENFORCE_GT(
       tensor->numel(), 0,
-      "You should call ZeroCopyTensor::Reshape(const std::vector<int> &shape)"
-      "function before retrieving mutable_data from input tensor.");
+      platform::errors::PreconditionNotMet(
+          "You should call ZeroCopyTensor::Reshape(const std::vector<int> "
+          "&shape)"
+          "function before retrieving mutable_data from input tensor."));
   switch (static_cast<int>(place)) {
     case static_cast<int>(PaddlePlace::kCPU): {
       return tensor->mutable_data<T>(platform::CPUPlace());
@@ -55,7 +63,8 @@ T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
       return tensor->mutable_data<T>(platform::CUDAPlace(device_));
     }
     default:
-      PADDLE_THROW("Unsupported place: %d", static_cast<int>(place));
+      PADDLE_THROW(platform::errors::Unavailable("Unsupported place: %d",
+                                                 static_cast<int>(place)));
       break;
   }
   return nullptr;
@@ -96,10 +105,11 @@ PaddleDType ZeroCopyTensor::type() const {
 template <typename T>
 void ZeroCopyTensor::copy_from_cpu(const T *data) {
   EAGER_GET_TENSOR;
-  PADDLE_ENFORCE_GE(
-      tensor->numel(), 0,
-      "You should call ZeroCopyTensor::Reshape(const std::vector<int> &shape)"
-      "function before copying data from cpu.");
+  PADDLE_ENFORCE_GE(tensor->numel(), 0,
+                    platform::errors::PreconditionNotMet(
+                        "You should call ZeroCopyTensor::Reshape(const "
+                        "std::vector<int> &shape)"
+                        "function before copying data from cpu."));
   size_t ele_size = tensor->numel() * sizeof(T);
   if (place_ == PaddlePlace::kCPU) {
@@ -116,7 +126,8 @@ void ZeroCopyTensor::copy_from_cpu(const T *data) {
     memory::Copy(gpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
                  data, ele_size, dev_ctx->stream());
 #else
-    PADDLE_THROW("Not compiled with CUDA, should not reach here.");
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Not compiled with CUDA, should not reach here."));
 #endif
   }
 }
@@ -141,7 +152,8 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
     cudaStreamSynchronize(dev_ctx->stream());
 #else
-    PADDLE_THROW("Not compile with CUDA, should not reach here.");
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Not compile with CUDA, should not reach here."));
 #endif
   }
 }
@@ -176,20 +188,27 @@ template PD_INFER_DECL uint8_t *ZeroCopyTensor::mutable_data<uint8_t>(
     PaddlePlace place);
 void *ZeroCopyTensor::FindTensor() const {
-  PADDLE_ENFORCE(!name_.empty(),
-                 "Need to SetName first, so that the corresponding tensor can "
-                 "be retrieved.");
-  PADDLE_ENFORCE(scope_);
+  PADDLE_ENFORCE_EQ(
+      name_.empty(), false,
+      platform::errors::PreconditionNotMet(
+          "Need to SetName first, so that the corresponding tensor can "
+          "be retrieved."));
+  PADDLE_ENFORCE_NOT_NULL(scope_, platform::errors::PreconditionNotMet(
+                                      "The scope should not be nullptr."));
   auto *scope = static_cast<framework::Scope *>(scope_);
   auto *var = scope->FindVar(name_);
-  PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
+  PADDLE_ENFORCE_NOT_NULL(
+      var, platform::errors::PreconditionNotMet(
+               "No tensor called [%s] in the runtime scope", name_));
   auto *tensor = var->GetMutable<framework::LoDTensor>();
   return tensor;
 }
 std::vector<int> ZeroCopyTensor::shape() const {
   EAGER_GET_TENSOR;
-  PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_);
+  PADDLE_ENFORCE_NOT_NULL(
+      tensor_, platform::errors::PreconditionNotMet(
+                   "Not found tensor called %s in the scope", name_));
   return framework::vectorize<int>(tensor->dims());
 }
...
@@ -31,12 +31,30 @@ limitations under the License. */
 #include "paddle_analysis_config.h"  // NOLINT
 #include "paddle_api.h"              // NOLINT
+///
+/// \file paddle_inference_api.h
+///
+/// \brief Paddle Inference API
+///
+/// \author paddle-infer@baidu.com
+/// \date 2020-09-01
+/// \since 2.0.0-beta
+///
 namespace paddle_infer {
 using DataType = paddle::PaddleDType;
 using PlaceType = paddle::PaddlePlace;
 using PrecisionType = paddle::AnalysisConfig::Precision;
 using Config = paddle::AnalysisConfig;
+///
+/// \class Tensor
+///
+/// \brief Represents an n-dimensional array of values.
+/// The Tensor is used to store the input or output of the network.
+/// It is obtained through Predictor::GetInputHandle()
+/// and Predictor::GetOutputHandle() interface.
+///
 class PD_INFER_DECL Tensor {
  public:
   // Can only be created by predictor->GetInputHandle(const std::string& name)
@@ -44,33 +62,106 @@ class PD_INFER_DECL Tensor {
   Tensor() = delete;
   explicit Tensor(std::unique_ptr<paddle::ZeroCopyTensor>&& tensor)
       : tensor_(std::move(tensor)) {}
+  ///
+  /// \brief Reset the shape of the tensor.
+  /// Generally it's only used for the input tensor.
+  /// Reshape must be called before calling mutable_data() or CopyFromCpu()
+  /// \param shape The shape to set.
+  ///
   void Reshape(const std::vector<int>& shape);
+  ///
+  /// \brief Copy the host memory to tensor data.
+  /// It's usually used to set the input tensor data.
+  /// \param data The pointer of the data, from which the tensor will copy.
+  ///
   template <typename T>
   void CopyFromCpu(const T* data);
-  // should add the place
+  ///
+  /// \brief Get the memory pointer in CPU or GPU with specific data type.
+  /// Please Reshape the tensor first before call this.
+  /// It's usually used to get input data pointer.
+  /// \param place The place of the tensor.
+  /// \return The tensor data buffer pointer.
+  ///
   template <typename T>
   T* mutable_data(PlaceType place);
+  ///
+  /// \brief Copy the tensor data to the host memory.
+  /// It's usually used to get the output tensor data.
+  /// \param[out] data The tensor will copy the data to the address.
+  ///
   template <typename T>
   void CopyToCpu(T* data);
+  ///
+  /// \brief Get the memory pointer directly.
+  /// It's usually used to get the output data pointer.
+  /// \param[out] place To get the device type of the tensor.
+  /// \param[out] size To get the data size of the tensor.
+  /// \return The tensor data buffer pointer.
+  ///
   template <typename T>
   T* data(PlaceType* place, int* size) const;
+  ///
+  /// \brief Set lod info of the tensor.
+  /// More about LOD can be seen here:
+  /// https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/basic_concept/lod_tensor.html#lodtensor
+  /// \param x the lod info.
+  ///
   void SetLoD(const std::vector<std::vector<size_t>>& x);
+  /// \brief Return the lod info of the tensor.
   std::vector<std::vector<size_t>> lod() const;
+  /// \brief Return the data type of the tensor.
+  /// It's usually used to get the output tensor data type.
+  /// \return The data type of the tensor.
   DataType type() const;
+  /// \brief Return the shape of the Tensor.
   std::vector<int> shape() const;
+  /// \brief Return the name of the tensor.
   const std::string& name() const;
  private:
   std::unique_ptr<paddle::ZeroCopyTensor> tensor_;
 };
+///
+/// \class Predictor
+///
+/// \brief Predictor is the interface for model prediction.
+///
+/// The predictor has the following typical uses:
+///
+/// Get predictor
+/// \code{cpp}
+/// auto predictor = CreatePredictor(config);
+/// \endcode
+///
+/// Get input or output names
+/// \code{cpp}
+/// auto input_names = predictor->GetInputNames();
+/// auto output_names = predictor->GetOutputNames();
+/// \endcode
+///
+/// Get input or output handle
+/// \code{cpp}
+/// auto input_t = predictor->GetInputHandle(input_names[0]);
+/// auto output_t = predictor->GetOutputHandle(output_names[0]);
+/// \endcode
+///
+/// Run predictor
+/// \code{cpp}
+/// predictor->Run();
+/// \endcode
+///
 class PD_INFER_DECL Predictor {
  public:
   Predictor() = delete;
@@ -79,25 +170,78 @@ class PD_INFER_DECL Predictor {
   explicit Predictor(std::unique_ptr<paddle::PaddlePredictor>&& pred)
       : predictor_(std::move(pred)) {}
+  ///
+  /// \brief Construct a new Predictor object
+  ///
+  /// \param[in] Config config
+  ///
   explicit Predictor(const Config& config);
+  ///
+  /// \brief Get the input names
+  ///
+  /// \return input names
+  ///
   std::vector<std::string> GetInputNames();
+  ///
+  /// \brief Get the Input Tensor object
+  ///
+  /// \param[in] name input name
+  /// \return input tensor
+  ///
   std::unique_ptr<Tensor> GetInputHandle(const std::string& name);
+  ///
+  /// \brief Run the prediction engine
+  ///
+  /// \return Whether the function executed successfully
+  ///
   bool Run();
+  ///
+  /// \brief Get the output names
+  ///
+  /// \return output names
+  ///
   std::vector<std::string> GetOutputNames();
+  ///
+  /// \brief Get the Output Tensor object
+  ///
+  /// \param[in] name output name
+  /// \return output tensor
+  ///
   std::unique_ptr<Tensor> GetOutputHandle(const std::string& name);
+  ///
+  /// \brief Clone to get the new predictor. thread safe.
+  ///
+  /// \return get a new predictor
+  ///
   std::unique_ptr<Predictor> Clone();
+  /// \brief Clear the intermediate tensors of the predictor
   void ClearIntermediateTensor();
  private:
   std::unique_ptr<paddle::PaddlePredictor> predictor_;
 };
+///
+/// \brief A factory to help create predictors.
+///
+/// Usage:
+///
+/// \code{.cpp}
+/// Config config;
+/// ... // change the configs.
+/// auto predictor = CreatePredictor(config);
+/// \endcode
+///
 PD_INFER_DECL std::shared_ptr<Predictor> CreatePredictor(
     const Config& config);  // NOLINT
 PD_INFER_DECL int GetNumBytesOfDataType(DataType dtype);
 PD_INFER_DECL std::string GetVersion();
@@ -128,13 +272,24 @@ T* Tensor::data(PlaceType* place, int* size) const {
 namespace paddle_infer {
 namespace services {
+///
+/// \class PredictorPool
+///
+/// \brief PredictorPool is a simple encapsulation of Predictor, suitable for
+/// use in multi-threaded situations. According to the thread id, the
+/// corresponding Predictor is taken out from PredictorPool to complete the
+/// prediction.
+///
 class PD_INFER_DECL PredictorPool {
  public:
   PredictorPool() = delete;
   PredictorPool(const PredictorPool&) = delete;
   PredictorPool& operator=(const PredictorPool&) = delete;
+  /// \brief Construct the predictor pool with \param size predictor instances.
   explicit PredictorPool(const Config& config, size_t size = 1);
+  /// \brief Get \param id-th predictor.
   Predictor* Retrive(size_t idx);
  private:
...
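Note: the Doxygen comments added above document the new paddle_infer API (Tensor, Predictor, CreatePredictor, PredictorPool). A minimal end-to-end sketch of the documented flow follows for orientation; the model path and the input shape are placeholders, error handling is omitted, and the sketch is an illustration rather than code from this commit.

#include <functional>
#include <numeric>
#include <vector>

#include "paddle_inference_api.h"  // NOLINT

int main() {
  paddle_infer::Config config;
  config.SetModel("/path/to/model_dir");  // placeholder model directory

  auto predictor = paddle_infer::CreatePredictor(config);

  // Feed one float input; the [1, 3, 224, 224] shape is a placeholder.
  auto input_names = predictor->GetInputNames();
  auto input_t = predictor->GetInputHandle(input_names[0]);
  std::vector<int> shape{1, 3, 224, 224};
  std::vector<float> input(1 * 3 * 224 * 224, 0.f);
  input_t->Reshape(shape);
  input_t->CopyFromCpu(input.data());

  predictor->Run();

  // Copy the first output back to host memory.
  auto output_names = predictor->GetOutputNames();
  auto output_t = predictor->GetOutputHandle(output_names[0]);
  auto out_shape = output_t->shape();
  int out_num = std::accumulate(out_shape.begin(), out_shape.end(), 1,
                                std::multiplies<int>());
  std::vector<float> output(out_num);
  output_t->CopyToCpu(output.data());
  return 0;
}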
@@ -16,6 +16,7 @@
 #include <vector>
 #include "paddle/fluid/inference/capi/c_api_internal.h"
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
+#include "paddle/fluid/platform/enforce.h"
 using paddle::ConvertToACPrecision;
 using paddle::ConvertToPaddleDType;
@@ -34,27 +35,37 @@ void PD_DeletePaddleBuf(PD_PaddleBuf* buf) {
 }
 void PD_PaddleBufResize(PD_PaddleBuf* buf, size_t length) {
-  PADDLE_ENFORCE_NOT_NULL(buf);
+  PADDLE_ENFORCE_NOT_NULL(buf,
+                          paddle::platform::errors::InvalidArgument(
+                              "The pointer of Buffer shouldn't be nullptr"));
   buf->buf.Resize(length);
 }
 void PD_PaddleBufReset(PD_PaddleBuf* buf, void* data, size_t length) {
-  PADDLE_ENFORCE_NOT_NULL(buf);
+  PADDLE_ENFORCE_NOT_NULL(buf,
+                          paddle::platform::errors::InvalidArgument(
+                              "The pointer of Buffer shouldn't be nullptr"));
   buf->buf.Reset(data, length);
 }
 bool PD_PaddleBufEmpty(PD_PaddleBuf* buf) {
-  PADDLE_ENFORCE_NOT_NULL(buf);
+  PADDLE_ENFORCE_NOT_NULL(buf,
+                          paddle::platform::errors::InvalidArgument(
+                              "The pointer of Buffer shouldn't be nullptr"));
   return buf->buf.empty();
 }
 void* PD_PaddleBufData(PD_PaddleBuf* buf) {
-  PADDLE_ENFORCE_NOT_NULL(buf);
+  PADDLE_ENFORCE_NOT_NULL(buf,
+                          paddle::platform::errors::InvalidArgument(
+                              "The pointer of Buffer shouldn't be nullptr"));
   return buf->buf.data();
 }
 size_t PD_PaddleBufLength(PD_PaddleBuf* buf) {
-  PADDLE_ENFORCE_NOT_NULL(buf);
+  PADDLE_ENFORCE_NOT_NULL(buf,
+                          paddle::platform::errors::InvalidArgument(
+                              "The pointer of Buffer shouldn't be nullptr"));
   return buf->buf.length();
 }
...
@@ -18,7 +18,6 @@
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/api/paddle_api.h"
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
-#include "paddle/fluid/platform/enforce.h"
 using PD_PaddleDType = paddle::PaddleDType;
 using PD_ACPrecision = paddle::AnalysisConfig::Precision;
...
@@ -20,6 +20,7 @@
 #include <vector>
 #include "paddle/fluid/inference/capi/c_api_internal.h"
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
+#include "paddle/fluid/platform/enforce.h"
 using paddle::ConvertToACPrecision;
 using paddle::ConvertToPaddleDType;
@@ -40,7 +41,10 @@ void PD_DeleteAnalysisConfig(PD_AnalysisConfig* config) {
 void PD_SetModel(PD_AnalysisConfig* config, const char* model_dir,
                  const char* params_path) {
   LOG(INFO) << model_dir;
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   LOG(INFO) << std::string(model_dir);
   if (!params_path) {
     config->config.SetModel(std::string(model_dir));
@@ -50,104 +54,164 @@ void PD_SetModel(PD_AnalysisConfig* config, const char* model_dir,
 }
 void PD_SetProgFile(PD_AnalysisConfig* config, const char* x) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SetProgFile(std::string(x));
 }
 void PD_SetParamsFile(PD_AnalysisConfig* config, const char* x) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SetParamsFile(std::string(x));
 }
 void PD_SetOptimCacheDir(PD_AnalysisConfig* config, const char* opt_cache_dir) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SetOptimCacheDir(std::string(opt_cache_dir));
 }
 const char* PD_ModelDir(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.model_dir().c_str();
 }
 const char* PD_ProgFile(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.prog_file().c_str();
 }
 const char* PD_ParamsFile(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.params_file().c_str();
 }
 void PD_EnableUseGpu(PD_AnalysisConfig* config, int memory_pool_init_size_mb,
                      int device_id) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.EnableUseGpu(static_cast<uint64_t>(memory_pool_init_size_mb),
                               device_id);
 }
 void PD_DisableGpu(PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.DisableGpu();
 }
 bool PD_UseGpu(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.use_gpu();
 }
 int PD_GpuDeviceId(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.gpu_device_id();
 }
 int PD_MemoryPoolInitSizeMb(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.memory_pool_init_size_mb();
 }
 float PD_FractionOfGpuMemoryForPool(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.fraction_of_gpu_memory_for_pool();
 }
 void PD_EnableCUDNN(PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.EnableCUDNN();
 }
 bool PD_CudnnEnabled(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.cudnn_enabled();
 }
 void PD_SwitchIrOptim(PD_AnalysisConfig* config, bool x) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SwitchIrOptim(x);
 }
 bool PD_IrOptim(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.ir_optim();
 }
 void PD_SwitchUseFeedFetchOps(PD_AnalysisConfig* config, bool x) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SwitchUseFeedFetchOps(x);
 }
 bool PD_UseFeedFetchOpsEnabled(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.use_feed_fetch_ops_enabled();
 }
 void PD_SwitchSpecifyInputNames(PD_AnalysisConfig* config, bool x) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SwitchSpecifyInputNames(x);
 }
 bool PD_SpecifyInputName(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.specify_input_name();
 }
@@ -155,110 +219,168 @@ void PD_EnableTensorRtEngine(PD_AnalysisConfig* config, int workspace_size,
                              int max_batch_size, int min_subgraph_size,
                              Precision precision, bool use_static,
                              bool use_calib_mode) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.EnableTensorRtEngine(
       workspace_size, max_batch_size, min_subgraph_size,
       paddle::ConvertToACPrecision(precision), use_static, use_calib_mode);
 }
 bool PD_TensorrtEngineEnabled(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.tensorrt_engine_enabled();
 }
 void PD_SwitchIrDebug(PD_AnalysisConfig* config, bool x) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SwitchIrDebug(x);
 }
 void PD_EnableMKLDNN(PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.EnableMKLDNN();
 }
 void PD_SetMkldnnCacheCapacity(PD_AnalysisConfig* config, int capacity) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SetMkldnnCacheCapacity(capacity);
 }
 bool PD_MkldnnEnabled(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.mkldnn_enabled();
 }
 void PD_SetCpuMathLibraryNumThreads(PD_AnalysisConfig* config,
                                     int cpu_math_library_num_threads) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SetCpuMathLibraryNumThreads(cpu_math_library_num_threads);
 }
 int PD_CpuMathLibraryNumThreads(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.cpu_math_library_num_threads();
 }
 void PD_EnableMkldnnQuantizer(PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.EnableMkldnnQuantizer();
 }
 bool PD_MkldnnQuantizerEnabled(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.mkldnn_quantizer_enabled();
 }
 void PD_EnableMkldnnBfloat16(PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound(
-                                      "PD_AnalysisConfig should not be null"));
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.EnableMkldnnBfloat16();
 }
 bool PD_MkldnnBfloat16Enabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound( PADDLE_ENFORCE_NOT_NULL(
"PD_AnalysisConfig should not be null")); config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.mkldnn_bfloat16_enabled(); return config->config.mkldnn_bfloat16_enabled();
} }
void PD_SetModelBuffer(PD_AnalysisConfig* config, const char* prog_buffer, void PD_SetModelBuffer(PD_AnalysisConfig* config, const char* prog_buffer,
size_t prog_buffer_size, const char* params_buffer, size_t prog_buffer_size, const char* params_buffer,
size_t params_buffer_size) { size_t params_buffer_size) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SetModelBuffer(prog_buffer, prog_buffer_size, params_buffer, config->config.SetModelBuffer(prog_buffer, prog_buffer_size, params_buffer,
params_buffer_size); params_buffer_size);
} }
bool PD_ModelFromMemory(const PD_AnalysisConfig* config) { bool PD_ModelFromMemory(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.model_from_memory(); return config->config.model_from_memory();
} }
void PD_EnableMemoryOptim(PD_AnalysisConfig* config) { void PD_EnableMemoryOptim(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableMemoryOptim(); config->config.EnableMemoryOptim();
} }
bool PD_MemoryOptimEnabled(const PD_AnalysisConfig* config) { bool PD_MemoryOptimEnabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.enable_memory_optim(); return config->config.enable_memory_optim();
} }
void PD_EnableProfile(PD_AnalysisConfig* config) { void PD_EnableProfile(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableProfile(); config->config.EnableProfile();
} }
bool PD_ProfileEnabled(const PD_AnalysisConfig* config) { bool PD_ProfileEnabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.profile_enabled(); return config->config.profile_enabled();
} }
void PD_SetInValid(PD_AnalysisConfig* config) { void PD_SetInValid(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SetInValid(); config->config.SetInValid();
} }
bool PD_IsValid(const PD_AnalysisConfig* config) { bool PD_IsValid(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.is_valid(); return config->config.is_valid();
} }
......
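A minimal usage sketch of the null-checked C-API setters above (PD_NewAnalysisConfig and PD_DeleteAnalysisConfig are assumed helpers from paddle_c_api.h, not shown in this hunk); passing a null config to any of these functions now fails with the InvalidArgument message above instead of a bare enforce.
#include "paddle/fluid/inference/capi/paddle_c_api.h"
int main() {
  PD_AnalysisConfig* config = PD_NewAnalysisConfig();  // assumed creation helper
  PD_SwitchIrOptim(config, true);                      // enable IR optimization passes
  PD_EnableMKLDNN(config);                             // effective only in MKL-DNN builds
  PD_SetCpuMathLibraryNumThreads(config, 4);           // CPU math library thread count
  PD_EnableMemoryOptim(config);                        // reuse memory across ops
  PD_DeleteAnalysisConfig(config);                     // assumed destruction helper
  return 0;
}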
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "paddle/fluid/inference/api/paddle_api.h" #include "paddle/fluid/inference/api/paddle_api.h"
#include "paddle/fluid/inference/capi/c_api_internal.h" #include "paddle/fluid/inference/capi/c_api_internal.h"
#include "paddle/fluid/inference/capi/paddle_c_api.h" #include "paddle/fluid/inference/capi/paddle_c_api.h"
#include "paddle/fluid/platform/enforce.h"
using paddle::ConvertToACPrecision; using paddle::ConvertToACPrecision;
using paddle::ConvertToPaddleDType; using paddle::ConvertToPaddleDType;
...@@ -81,7 +82,10 @@ extern "C" { ...@@ -81,7 +82,10 @@ extern "C" {
bool PD_PredictorRun(const PD_AnalysisConfig* config, PD_Tensor* inputs, bool PD_PredictorRun(const PD_AnalysisConfig* config, PD_Tensor* inputs,
int in_size, PD_Tensor** output_data, int* out_size, int in_size, PD_Tensor** output_data, int* out_size,
int batch_size) { int batch_size) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
VLOG(3) << "Predictor: PD_PredictorRun. "; VLOG(3) << "Predictor: PD_PredictorRun. ";
static std::map<std::string, std::unique_ptr<paddle::PaddlePredictor>> static std::map<std::string, std::unique_ptr<paddle::PaddlePredictor>>
predictors; predictors;
...@@ -111,7 +115,10 @@ bool PD_PredictorRun(const PD_AnalysisConfig* config, PD_Tensor* inputs, ...@@ -111,7 +115,10 @@ bool PD_PredictorRun(const PD_AnalysisConfig* config, PD_Tensor* inputs,
bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config, bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config,
PD_ZeroCopyData* inputs, int in_size, PD_ZeroCopyData* inputs, int in_size,
PD_ZeroCopyData** output, int* out_size) { PD_ZeroCopyData** output, int* out_size) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
static std::map<std::string, std::unique_ptr<paddle::PaddlePredictor>> static std::map<std::string, std::unique_ptr<paddle::PaddlePredictor>>
predictors; predictors;
if (!predictors.count(config->config.model_dir())) { if (!predictors.count(config->config.model_dir())) {
...@@ -144,7 +151,8 @@ bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config, ...@@ -144,7 +151,8 @@ bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config,
input_t->copy_from_cpu(static_cast<uint8_t*>(inputs[i].data)); input_t->copy_from_cpu(static_cast<uint8_t*>(inputs[i].data));
break; break;
default: default:
CHECK(false) << "Unsupport data type."; PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"Unsupported data type."));
break; break;
} }
} }
...@@ -227,7 +235,8 @@ void PD_SetZeroCopyInput(PD_Predictor* predictor, ...@@ -227,7 +235,8 @@ void PD_SetZeroCopyInput(PD_Predictor* predictor,
input->copy_from_cpu(static_cast<uint8_t*>(tensor->data.data)); input->copy_from_cpu(static_cast<uint8_t*>(tensor->data.data));
break; break;
default: default:
CHECK(false) << "Unsupport data type."; PADDLE_THROW(
paddle::platform::errors::InvalidArgument("Unsupported data type."));
break; break;
} }
...@@ -294,7 +303,8 @@ void PD_GetZeroCopyOutput(PD_Predictor* predictor, PD_ZeroCopyTensor* tensor) { ...@@ -294,7 +303,8 @@ void PD_GetZeroCopyOutput(PD_Predictor* predictor, PD_ZeroCopyTensor* tensor) {
output->copy_to_cpu(reinterpret_cast<uint8_t*>(tensor->data.data)); output->copy_to_cpu(reinterpret_cast<uint8_t*>(tensor->data.data));
break; break;
default: default:
CHECK(false) << "Unsupport data type."; PADDLE_THROW(
paddle::platform::errors::InvalidArgument("Unsupported data type."));
break; break;
} }
} }
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include <vector> #include <vector>
#include "paddle/fluid/inference/capi/c_api_internal.h" #include "paddle/fluid/inference/capi/c_api_internal.h"
#include "paddle/fluid/inference/capi/paddle_c_api.h" #include "paddle/fluid/inference/capi/paddle_c_api.h"
#include "paddle/fluid/platform/enforce.h"
using paddle::ConvertToACPrecision; using paddle::ConvertToACPrecision;
using paddle::ConvertToPaddleDType; using paddle::ConvertToPaddleDType;
...@@ -37,44 +38,60 @@ void PD_DeletePaddleTensor(PD_Tensor* tensor) { ...@@ -37,44 +38,60 @@ void PD_DeletePaddleTensor(PD_Tensor* tensor) {
} }
void PD_SetPaddleTensorName(PD_Tensor* tensor, char* name) { void PD_SetPaddleTensorName(PD_Tensor* tensor, char* name) {
PADDLE_ENFORCE_NOT_NULL(tensor); PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
tensor->tensor.name = std::string(name); tensor->tensor.name = std::string(name);
} }
void PD_SetPaddleTensorDType(PD_Tensor* tensor, PD_DataType dtype) { void PD_SetPaddleTensorDType(PD_Tensor* tensor, PD_DataType dtype) {
PADDLE_ENFORCE_NOT_NULL(tensor); PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
tensor->tensor.dtype = paddle::ConvertToPaddleDType(dtype); tensor->tensor.dtype = paddle::ConvertToPaddleDType(dtype);
} }
void PD_SetPaddleTensorData(PD_Tensor* tensor, PD_PaddleBuf* buf) { void PD_SetPaddleTensorData(PD_Tensor* tensor, PD_PaddleBuf* buf) {
PADDLE_ENFORCE_NOT_NULL(tensor); PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
tensor->tensor.data = buf->buf; tensor->tensor.data = buf->buf;
} }
void PD_SetPaddleTensorShape(PD_Tensor* tensor, int* shape, int size) { void PD_SetPaddleTensorShape(PD_Tensor* tensor, int* shape, int size) {
PADDLE_ENFORCE_NOT_NULL(tensor); PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
tensor->tensor.shape.assign(shape, shape + size); tensor->tensor.shape.assign(shape, shape + size);
} }
const char* PD_GetPaddleTensorName(const PD_Tensor* tensor) { const char* PD_GetPaddleTensorName(const PD_Tensor* tensor) {
PADDLE_ENFORCE_NOT_NULL(tensor); PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
return tensor->tensor.name.c_str(); return tensor->tensor.name.c_str();
} }
PD_DataType PD_GetPaddleTensorDType(const PD_Tensor* tensor) { PD_DataType PD_GetPaddleTensorDType(const PD_Tensor* tensor) {
PADDLE_ENFORCE_NOT_NULL(tensor); PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
return ConvertToPDDataType(tensor->tensor.dtype); return ConvertToPDDataType(tensor->tensor.dtype);
} }
PD_PaddleBuf* PD_GetPaddleTensorData(const PD_Tensor* tensor) { PD_PaddleBuf* PD_GetPaddleTensorData(const PD_Tensor* tensor) {
PADDLE_ENFORCE_NOT_NULL(tensor); PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
PD_PaddleBuf* ret = PD_NewPaddleBuf(); PD_PaddleBuf* ret = PD_NewPaddleBuf();
ret->buf = tensor->tensor.data; ret->buf = tensor->tensor.data;
return ret; return ret;
} }
const int* PD_GetPaddleTensorShape(const PD_Tensor* tensor, int* size) { const int* PD_GetPaddleTensorShape(const PD_Tensor* tensor, int* size) {
PADDLE_ENFORCE_NOT_NULL(tensor); PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
const std::vector<int>& shape = tensor->tensor.shape; const std::vector<int>& shape = tensor->tensor.shape;
*size = shape.size(); *size = shape.size();
return shape.data(); return shape.data();
......
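A short sketch of the tensor setters and getters above; PD_NewPaddleTensor and the PD_FLOAT32 enum value are assumed to come from paddle_c_api.h and are not part of this hunk.
PD_Tensor* t = PD_NewPaddleTensor();                       // assumed creation helper
int shape[2] = {1, 4};
PD_SetPaddleTensorName(t, const_cast<char*>("input0"));    // stored as std::string
PD_SetPaddleTensorDType(t, PD_FLOAT32);                    // assumed PD_DataType value
PD_SetPaddleTensorShape(t, shape, 2);
int size = 0;
const int* got_shape = PD_GetPaddleTensorShape(t, &size);  // size becomes 2
(void)got_shape;
PD_DeletePaddleTensor(t);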
...@@ -20,8 +20,12 @@ ...@@ -20,8 +20,12 @@
#define LITE_WITH_XPU 1 #define LITE_WITH_XPU 1
#endif #endif
#ifndef PADDLE_WITH_ARM
#define LITE_WITH_X86 1
#endif
#include "paddle/fluid/inference/lite/engine.h" #include "paddle/fluid/inference/lite/engine.h"
#include "lite/api/paddle_use_passes.h" #include <utility>
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -36,32 +40,40 @@ bool EngineManager::Has(const std::string& name) const { ...@@ -36,32 +40,40 @@ bool EngineManager::Has(const std::string& name) const {
return engines_.at(name).get() != nullptr; return engines_.at(name).get() != nullptr;
} }
paddle::lite::Predictor* EngineManager::Get(const std::string& name) const { paddle::lite_api::PaddlePredictor* EngineManager::Get(
const std::string& name) const {
return engines_.at(name).get(); return engines_.at(name).get();
} }
paddle::lite::Predictor* EngineManager::Create(const std::string& name, paddle::lite_api::PaddlePredictor* EngineManager::Create(
const EngineConfig& cfg) { const std::string& name, const EngineConfig& cfg) {
if (cfg.valid_places.front().target == TARGET(kCUDA)) { // config info for predictor.
#ifdef PADDLE_WITH_CUDA paddle::lite_api::CxxConfig lite_cxx_config;
paddle::lite::Env<TARGET(kCUDA)>::Init(); lite_cxx_config.set_model_buffer(cfg.model.c_str(), cfg.model.size(),
cfg.param.c_str(), cfg.param.size());
lite_cxx_config.set_valid_places(cfg.valid_places);
#ifdef PADDLE_WITH_ARM
lite_cxx_config.set_threads(cfg.cpu_math_library_num_threads);
#else
lite_cxx_config.set_x86_math_library_num_threads(
cfg.cpu_math_library_num_threads);
#endif #endif
} else if (cfg.valid_places.front().target == TARGET(kXPU)) {
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
paddle::lite::TargetWrapper<TARGET(kXPU)>::workspace_l3_size_per_thread = lite_cxx_config.set_xpu_workspace_l3_size_per_thread(
cfg.xpu_l3_workspace_size; cfg.xpu_l3_workspace_size);
#endif #endif
}
auto* p = new paddle::lite::Predictor(); // create predictor
p->Build("", cfg.model, cfg.param, cfg.valid_places, cfg.neglected_passes, std::shared_ptr<paddle::lite_api::PaddlePredictor> p =
cfg.model_type, cfg.model_from_memory); paddle::lite_api::CreatePaddlePredictor(lite_cxx_config);
engines_[name].reset(p); engines_[name] = std::move(p);
return p; return engines_[name].get();
} }
void EngineManager::DeleteAll() { void EngineManager::DeleteAll() {
for (auto& item : engines_) { for (auto& item : engines_) {
item.second.reset(nullptr); item.second.reset();
} }
} }
......
...@@ -23,12 +23,9 @@ ...@@ -23,12 +23,9 @@
#pragma GCC diagnostic push #pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wall" #pragma GCC diagnostic ignored "-Wall"
#include "lite/api/cxx_api.h" #include "lite/api/cxx_api.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_place.h" #include "lite/api/paddle_place.h"
#include "lite/core/context.h" #include "lite/api/paddle_use_passes.h"
#include "lite/core/device_info.h"
#include "lite/core/memory.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#pragma GCC diagnostic pop #pragma GCC diagnostic pop
namespace paddle { namespace paddle {
...@@ -38,25 +35,33 @@ namespace lite { ...@@ -38,25 +35,33 @@ namespace lite {
struct EngineConfig { struct EngineConfig {
std::string model; std::string model;
std::string param; std::string param;
paddle::lite::Place prefer_place; std::vector<paddle::lite_api::Place> valid_places;
std::vector<paddle::lite::Place> valid_places;
std::vector<std::string> neglected_passes; std::vector<std::string> neglected_passes;
lite_api::LiteModelType model_type{lite_api::LiteModelType::kProtobuf}; lite_api::LiteModelType model_type{lite_api::LiteModelType::kProtobuf};
bool model_from_memory{true}; bool model_from_memory{true};
// for xpu
size_t xpu_l3_workspace_size; size_t xpu_l3_workspace_size;
// for x86 or arm
int cpu_math_library_num_threads{1};
// for cuda
bool use_multi_stream{false};
}; };
class EngineManager { class EngineManager {
public: public:
bool Empty() const; bool Empty() const;
bool Has(const std::string& name) const; bool Has(const std::string& name) const;
paddle::lite::Predictor* Get(const std::string& name) const; paddle::lite_api::PaddlePredictor* Get(const std::string& name) const;
paddle::lite::Predictor* Create(const std::string& name, paddle::lite_api::PaddlePredictor* Create(const std::string& name,
const EngineConfig& cfg); const EngineConfig& cfg);
void DeleteAll(); void DeleteAll();
private: private:
std::unordered_map<std::string, std::unique_ptr<paddle::lite::Predictor>> std::unordered_map<std::string,
std::shared_ptr<paddle::lite_api::PaddlePredictor>>
engines_; engines_;
}; };
......
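A hedged sketch of driving the reworked engine manager declared above; the field names follow the EngineConfig struct, the singleton access mirrors the test further below, and the model/param buffers are placeholders rather than values from this diff.
paddle::inference::lite::EngineConfig cfg;
cfg.model = model_program_buffer;   // placeholder: serialized program contents
cfg.param = model_param_buffer;     // placeholder: serialized parameter contents
cfg.valid_places = {
    paddle::lite_api::Place({TARGET(kX86), PRECISION(kFloat)}),
    paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}),
};
cfg.cpu_math_library_num_threads = 1;
auto& manager =
    paddle::inference::Singleton<paddle::inference::lite::EngineManager>::Global();
paddle::lite_api::PaddlePredictor* predictor = manager.Create("engine_0", cfg);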
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/lite/tensor_utils.h" #include "paddle/fluid/inference/lite/tensor_utils.h"
#include <functional>
#include <map> #include <map>
#include <memory> #include <memory>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
...@@ -144,16 +145,55 @@ void MemoryCopyAsync(const platform::Place& dst_place, void* dst_data, ...@@ -144,16 +145,55 @@ void MemoryCopyAsync(const platform::Place& dst_place, void* dst_data,
} }
} }
void InitDstTensor(paddle::lite::Tensor* dst, const framework::LoDTensor& src) { void* GetLiteTensorDataPtr(paddle::lite_api::Tensor* src,
PrecisionType precision_type,
TargetType target_type) {
void* res{nullptr};
switch (precision_type) {
case PrecisionType::kFloat:
res = static_cast<void*>(src->mutable_data<float>(target_type));
break;
case PrecisionType::kInt8:
res = static_cast<void*>(src->mutable_data<int8_t>(target_type));
break;
case PrecisionType::kInt32:
res = static_cast<void*>(src->mutable_data<int32_t>(target_type));
break;
case PrecisionType::kInt64:
res = static_cast<void*>(src->mutable_data<int64_t>(target_type));
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported precision type. Now only supports FP32, INT8, INT32 and "
"INT64."));
break;
}
return res;
}
int64_t GetLiteTensorNumel(const paddle::lite_api::Tensor& tensor) {
auto shape = tensor.shape();
int64_t numel = std::accumulate(shape.begin(), shape.end(), 1,
std::multiplies<int64_t>());
return numel;
}
void InitDstTensor(paddle::lite_api::Tensor* dst,
const framework::LoDTensor& src) {
// Currently, Lite needs to explicitly specify the target type of // Currently, Lite needs to explicitly specify the target type of
// the input tensor. // the input tensor.
constexpr int empty_size = 0; constexpr int empty_size = 0;
dst->mutable_data(GetLiteTargetType(src.place()), empty_size); dst->Resize({empty_size});
dst->set_precision(GetLitePrecisionType(src.type())); GetLiteTensorDataPtr(dst, GetLitePrecisionType(src.type()),
SetLoD(dst->mutable_lod(), src.lod()); GetLiteTargetType(src.place()));
dst->SetPrecision(GetLitePrecisionType(src.type()));
paddle::lite::LoD lite_lod;
SetLoD(&lite_lod, src.lod());
dst->SetLoD(lite_lod);
} }
void InitDstTensor(framework::LoDTensor* dst, const paddle::lite::Tensor& src) { void InitDstTensor(framework::LoDTensor* dst,
const paddle::lite_api::Tensor& src) {
constexpr framework::proto::VarType::Type dtype = constexpr framework::proto::VarType::Type dtype =
framework::proto::VarType_Type_FP32; framework::proto::VarType_Type_FP32;
dst->mutable_data(inference::lite::utils::GetNativePlace(src.target()), dst->mutable_data(inference::lite::utils::GetNativePlace(src.target()),
...@@ -162,7 +202,8 @@ void InitDstTensor(framework::LoDTensor* dst, const paddle::lite::Tensor& src) { ...@@ -162,7 +202,8 @@ void InitDstTensor(framework::LoDTensor* dst, const paddle::lite::Tensor& src) {
} }
template <> template <>
void TensorCopyAsync(paddle::lite::Tensor* dst, const framework::LoDTensor& src, void TensorCopyAsync(paddle::lite_api::Tensor* dst,
const framework::LoDTensor& src,
const platform::DeviceContext& ctx) { const platform::DeviceContext& ctx) {
InitDstTensor(dst, src); InitDstTensor(dst, src);
const platform::Place& src_place = src.place(); const platform::Place& src_place = src.place();
...@@ -171,52 +212,56 @@ void TensorCopyAsync(paddle::lite::Tensor* dst, const framework::LoDTensor& src, ...@@ -171,52 +212,56 @@ void TensorCopyAsync(paddle::lite::Tensor* dst, const framework::LoDTensor& src,
static_cast<size_t>(src.numel()) * framework::SizeOfType(src.type()); static_cast<size_t>(src.numel()) * framework::SizeOfType(src.type());
dst->Resize(framework::vectorize(src.dims())); dst->Resize(framework::vectorize(src.dims()));
const void* src_data = src.data<void>(); const void* src_data = src.data<void>();
void* dst_data = dst->mutable_data(bytes); void* dst_data{nullptr};
dst_data = GetLiteTensorDataPtr(dst, GetLitePrecisionType(src.type()),
GetLiteTargetType(src.place()));
VLOG(3) << "[CopyAsync fluid -> lite] Bytes = " << bytes << ", src = " << &src VLOG(3) << "[CopyAsync fluid -> lite] Bytes = " << bytes << ", src = " << &src
<< ", dst = " << dst << ", src_type = " << src.type(); << ", dst = " << dst << ", src_type = " << src.type();
MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx); MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx);
VLOG(3) << "[Lite memory size] Bytes = " << dst->memory_size(); VLOG(3) << "[Lite memory size] Bytes = " << bytes;
} }
template <> template <>
void TensorCopyAsync(framework::LoDTensor* dst, const paddle::lite::Tensor& src, void TensorCopyAsync(framework::LoDTensor* dst,
const paddle::lite_api::Tensor& src,
const platform::DeviceContext& ctx) { const platform::DeviceContext& ctx) {
dst->Resize(paddle::framework::make_ddim(src.dims().Vectorize())); dst->Resize(paddle::framework::make_ddim(src.shape()));
InitDstTensor(dst, src); InitDstTensor(dst, src);
const platform::Place& src_place = GetNativePlace(src.target()); const platform::Place& src_place = GetNativePlace(src.target());
const platform::Place& dst_place = dst->place(); const platform::Place& dst_place = dst->place();
const size_t bytes = int64_t src_numel = GetLiteTensorNumel(src);
static_cast<size_t>(src.numel()) * framework::SizeOfType(dst->type()); const size_t bytes = src_numel * framework::SizeOfType(dst->type());
const void* src_data = src.raw_data(); const void* src_data = src.data<void>();
// When Lite is ready, the source type needs to be modified here. // When Lite is ready, the source type needs to be modified here.
void* dst_data = dst->mutable_data(dst_place, dst->type()); void* dst_data = dst->mutable_data(dst_place, dst->type());
VLOG(3) << "[CopyAsync lite -> fluid] Bytes = " << bytes << ", src = " << &src VLOG(3) << "[CopyAsync lite -> fluid] Bytes = " << bytes << ", src = " << &src
<< ", dst = " << dst << ", src_type = " << dst->type(); << ", dst = " << dst << ", src_type = " << dst->type();
MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx); MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx);
VLOG(3) << "[Lite memory size] Bytes = " << src.memory_size(); VLOG(3) << "[Lite memory size] Bytes = " << bytes;
} }
template <> template <>
void TensorDataShare(paddle::lite::Tensor* dst, framework::LoDTensor* src) { void TensorDataShare(paddle::lite_api::Tensor* dst, framework::LoDTensor* src) {
const size_t bytes =
static_cast<size_t>(src->numel()) * framework::SizeOfType(src->type());
auto buf = std::make_shared<paddle::lite::Buffer>(paddle::lite::Buffer(
src->data<void>(), GetLiteTargetType(src->place()), src->memory_size()));
dst->Resize(framework::vectorize(src->dims())); dst->Resize(framework::vectorize(src->dims()));
dst->set_precision(GetLitePrecisionType(src->type())); dst->ShareExternalMemory(src->data<void>(), src->memory_size(),
SetLoD(dst->mutable_lod(), src->lod()); GetLiteTargetType(src->place()));
dst->ResetBuffer(buf, bytes); dst->SetPrecision(GetLitePrecisionType(src->type()));
paddle::lite::LoD lite_lod;
SetLoD(&lite_lod, src->lod());
dst->SetLoD(lite_lod);
} }
template <> template <>
void TensorDataShare(framework::LoDTensor* dst, paddle::lite::Tensor* src) { void TensorDataShare(framework::LoDTensor* dst, paddle::lite_api::Tensor* src) {
constexpr framework::proto::VarType::Type dtype = constexpr framework::proto::VarType::Type dtype =
framework::proto::VarType_Type_FP32; framework::proto::VarType_Type_FP32;
void* src_raw_data = src->raw_data(); void* src_raw_data =
GetLiteTensorDataPtr(src, GetLitePrecisionType(dtype), src->target());
size_t memory_size = GetLiteTensorNumel(*src) * sizeof(float);
std::shared_ptr<memory::allocation::Allocation> holder( std::shared_ptr<memory::allocation::Allocation> holder(
new memory::allocation::Allocation(src_raw_data, src->memory_size(), new memory::allocation::Allocation(src_raw_data, memory_size,
GetNativePlace(src->target()))); GetNativePlace(src->target())));
dst->Resize(paddle::framework::make_ddim(src->dims().Vectorize())); dst->Resize(paddle::framework::make_ddim(src->shape()));
SetLoD(dst->mutable_lod(), src->lod()); SetLoD(dst->mutable_lod(), src->lod());
dst->ResetHolderWithType(holder, dtype); dst->ResetHolderWithType(holder, dtype);
} }
......
...@@ -102,10 +102,10 @@ TEST(EngineManager, engine) { ...@@ -102,10 +102,10 @@ TEST(EngineManager, engine) {
config.model_from_memory = true; config.model_from_memory = true;
config.valid_places = { config.valid_places = {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
paddle::lite::Place({TARGET(kCUDA), PRECISION(kFloat)}), paddle::lite_api::Place({TARGET(kCUDA), PRECISION(kFloat)}),
#endif #endif
paddle::lite::Place({TARGET(kX86), PRECISION(kFloat)}), paddle::lite_api::Place({TARGET(kX86), PRECISION(kFloat)}),
paddle::lite::Place({TARGET(kHost), PRECISION(kAny)}), paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}),
}; };
LOG(INFO) << "Create EngineManager"; LOG(INFO) << "Create EngineManager";
...@@ -118,7 +118,7 @@ TEST(EngineManager, engine) { ...@@ -118,7 +118,7 @@ TEST(EngineManager, engine) {
ASSERT_EQ(inference::Singleton<inference::lite::EngineManager>::Global().Has( ASSERT_EQ(inference::Singleton<inference::lite::EngineManager>::Global().Has(
unique_key), unique_key),
true); true);
paddle::lite::Predictor* engine_0 = paddle::lite_api::PaddlePredictor* engine_0 =
inference::Singleton<inference::lite::EngineManager>::Global().Get( inference::Singleton<inference::lite::EngineManager>::Global().Get(
unique_key); unique_key);
CHECK_NOTNULL(engine_0); CHECK_NOTNULL(engine_0);
......
...@@ -73,6 +73,33 @@ TEST(LiteEngineOp, GetNativeLayoutType) { ...@@ -73,6 +73,33 @@ TEST(LiteEngineOp, GetNativeLayoutType) {
EXPECT_ANY_THROW(GetNativeLayoutType(DataLayoutType::kNHWC)); EXPECT_ANY_THROW(GetNativeLayoutType(DataLayoutType::kNHWC));
} }
template <typename T>
void test_lite_tensor_data_ptr(PrecisionType precision_type) {
void* GetLiteTensorDataPtr(paddle::lite_api::Tensor * src,
PrecisionType precision_type,
TargetType target_type);
const int count = 4;
paddle::lite::Tensor lite_tensor;
lite_tensor.Resize({count});
auto* lite_tensor_data = lite_tensor.mutable_data<T>();
for (size_t i = 0; i < count; ++i) {
lite_tensor_data[i] = i;
}
paddle::lite_api::Tensor lite_api_tensor(&lite_tensor);
T* data = static_cast<T*>(GetLiteTensorDataPtr(
&lite_api_tensor, precision_type, TargetType::kHost));
for (size_t i = 0; i < count; ++i) {
CHECK_EQ(data[i], static_cast<T>(i)) << "the i-th num is not correct.";
}
}
TEST(LiteEngineOp, GetLiteTensorDataPtr) {
test_lite_tensor_data_ptr<int64_t>(PrecisionType::kInt64);
test_lite_tensor_data_ptr<int32_t>(PrecisionType::kInt32);
test_lite_tensor_data_ptr<int8_t>(PrecisionType::kInt8);
EXPECT_ANY_THROW(test_lite_tensor_data_ptr<double>(PrecisionType::kUnk));
}
void test_tensor_copy(const platform::DeviceContext& ctx) { void test_tensor_copy(const platform::DeviceContext& ctx) {
// Create LoDTensor. // Create LoDTensor.
std::vector<float> vector({1, 2, 3, 4}); std::vector<float> vector({1, 2, 3, 4});
...@@ -83,10 +110,11 @@ void test_tensor_copy(const platform::DeviceContext& ctx) { ...@@ -83,10 +110,11 @@ void test_tensor_copy(const platform::DeviceContext& ctx) {
lod_tensor.set_lod(lod); lod_tensor.set_lod(lod);
// Create lite::Tensor and copy. // Create lite::Tensor and copy.
paddle::lite::Tensor lite_tensor; paddle::lite::Tensor lite_tensor;
TensorCopyAsync(&lite_tensor, lod_tensor, ctx); paddle::lite_api::Tensor lite_api_tensor(&lite_tensor);
TensorCopyAsync(&lite_api_tensor, lod_tensor, ctx);
// Copy to LoDTensor. // Copy to LoDTensor.
framework::LoDTensor lod_tensor_n; framework::LoDTensor lod_tensor_n;
TensorCopyAsync(&lod_tensor_n, lite_tensor, ctx); TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::is_gpu_place(ctx.GetPlace())) {
platform::GpuStreamSync( platform::GpuStreamSync(
...@@ -108,10 +136,11 @@ void test_tensor_share(const platform::DeviceContext& ctx) { ...@@ -108,10 +136,11 @@ void test_tensor_share(const platform::DeviceContext& ctx) {
lod_tensor.set_lod(lod); lod_tensor.set_lod(lod);
// Create lite::Tensor and share. // Create lite::Tensor and share.
paddle::lite::Tensor lite_tensor; paddle::lite::Tensor lite_tensor;
TensorDataShare(&lite_tensor, &lod_tensor); paddle::lite_api::Tensor lite_api_tensor(&lite_tensor);
TensorDataShare(&lite_api_tensor, &lod_tensor);
// Copy to LoDTensor. // Copy to LoDTensor.
framework::LoDTensor lod_tensor_n; framework::LoDTensor lod_tensor_n;
TensorCopyAsync(&lod_tensor_n, lite_tensor, ctx); TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx);
std::vector<float> result; std::vector<float> result;
TensorToVector(lod_tensor_n, ctx, &result); TensorToVector(lod_tensor_n, ctx, &result);
ASSERT_EQ(result, vector); ASSERT_EQ(result, vector);
......
...@@ -63,11 +63,13 @@ void TensorRTEngine::Execute(int batch_size, std::vector<void *> *buffers, ...@@ -63,11 +63,13 @@ void TensorRTEngine::Execute(int batch_size, std::vector<void *> *buffers,
void TensorRTEngine::FreezeNetwork() { void TensorRTEngine::FreezeNetwork() {
freshDeviceId(); freshDeviceId();
VLOG(3) << "TRT to freeze network"; VLOG(3) << "TRT to freeze network";
PADDLE_ENFORCE(infer_builder_ != nullptr, PADDLE_ENFORCE_NOT_NULL(infer_builder_,
"Call InitNetwork first to initialize network."); platform::errors::InvalidArgument(
PADDLE_ENFORCE_EQ(network() != nullptr, true, "Inference builder of TRT is null. Please make "
platform::errors::InvalidArgument( "sure you call InitNetwork first."));
"Call InitNetwork first to initialize network.")); PADDLE_ENFORCE_NOT_NULL(network(),
platform::errors::InvalidArgument(
"Call InitNetwork first to initialize network."));
// build engine. // build engine.
infer_builder_->setMaxBatchSize(max_batch_); infer_builder_->setMaxBatchSize(max_batch_);
infer_builder_->setMaxWorkspaceSize(max_workspace_); infer_builder_->setMaxWorkspaceSize(max_workspace_);
...@@ -210,7 +212,10 @@ void TensorRTEngine::FreezeNetwork() { ...@@ -210,7 +212,10 @@ void TensorRTEngine::FreezeNetwork() {
} else { } else {
infer_engine_.reset(infer_builder_->buildCudaEngine(*network())); infer_engine_.reset(infer_builder_->buildCudaEngine(*network()));
} }
PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!"); PADDLE_ENFORCE_NOT_NULL(
infer_engine_, platform::errors::Fatal(
"Build TensorRT cuda engine failed! Please recheck "
"you configurations related to paddle-TensorRT."));
} }
nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name, nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
...@@ -220,8 +225,16 @@ nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name, ...@@ -220,8 +225,16 @@ nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The TRT network should be initialized first.")); "The TRT network should be initialized first."));
auto *input = network()->addInput(name.c_str(), dtype, dims); auto *input = network()->addInput(name.c_str(), dtype, dims);
PADDLE_ENFORCE(input, "infer network add input %s failed", name); PADDLE_ENFORCE_NOT_NULL(
PADDLE_ENFORCE(input->isNetworkInput()); input, platform::errors::InvalidArgument("Adding input %s failed in "
"TensorRT inference network. "
"Please recheck your input.",
name));
PADDLE_ENFORCE_EQ(input->isNetworkInput(), true,
platform::errors::InvalidArgument(
"Input %s is not the input of TRT inference network. "
"Please recheck your input.",
name));
TensorRTEngine::SetITensor(name, input); TensorRTEngine::SetITensor(name, input);
return input; return input;
} }
...@@ -230,31 +243,53 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset, ...@@ -230,31 +243,53 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
const std::string &name) { const std::string &name) {
auto *output = layer->getOutput(offset); auto *output = layer->getOutput(offset);
SetITensor(name, output); SetITensor(name, output);
PADDLE_ENFORCE(output != nullptr); PADDLE_ENFORCE_NOT_NULL(
output, platform::errors::InvalidArgument(
"The output %s of TRT engine should not be null.", name));
output->setName(name.c_str()); output->setName(name.c_str());
PADDLE_ENFORCE(!output->isNetworkInput()); PADDLE_ENFORCE_EQ(output->isNetworkInput(), false,
platform::errors::InvalidArgument(
"The output %s of TRT engine should not be the input "
"of the network at the same time.",
name));
network()->markOutput(*output); network()->markOutput(*output);
PADDLE_ENFORCE(output->isNetworkOutput()); PADDLE_ENFORCE_EQ(
output->isNetworkOutput(), true,
platform::errors::InvalidArgument(
"The output %s of TRT engine should be the output of the network.",
name));
} }
void TensorRTEngine::DeclareOutput(const std::string &name) { void TensorRTEngine::DeclareOutput(const std::string &name) {
auto *output = TensorRTEngine::GetITensor(name); auto *output = TensorRTEngine::GetITensor(name);
PADDLE_ENFORCE(output != nullptr); PADDLE_ENFORCE_NOT_NULL(
output, platform::errors::InvalidArgument(
"The output %s of TRT engine should not be null.", name));
output->setName(name.c_str()); output->setName(name.c_str());
PADDLE_ENFORCE(!output->isNetworkInput()); PADDLE_ENFORCE_EQ(output->isNetworkInput(), false,
platform::errors::InvalidArgument(
"The output %s of TRT engine should not be the input "
"of the network at the same time.",
name));
network()->markOutput(*output); network()->markOutput(*output);
} }
void TensorRTEngine::SetITensor(const std::string &name, void TensorRTEngine::SetITensor(const std::string &name,
nvinfer1::ITensor *tensor) { nvinfer1::ITensor *tensor) {
PADDLE_ENFORCE(tensor != nullptr); PADDLE_ENFORCE_NOT_NULL(
PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s", tensor, platform::errors::InvalidArgument(
name); "Tensor named %s of TRT engine should not be null.", name));
PADDLE_ENFORCE_EQ(
0, itensor_map_.count(name),
platform::errors::InvalidArgument(
"Tensor named %s of TRT engine should not be duplicated", name));
itensor_map_[name] = tensor; itensor_map_[name] = tensor;
} }
nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) { nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) {
PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name); PADDLE_ENFORCE_EQ(itensor_map_.count(name), true,
platform::errors::NotFound(
"Tensor named %s is not found in TRT engine", name));
return itensor_map_[name]; return itensor_map_[name];
} }
...@@ -271,11 +306,11 @@ float *TensorRTEngine::GetWeightCPUData(const std::string &name, ...@@ -271,11 +306,11 @@ float *TensorRTEngine::GetWeightCPUData(const std::string &name,
std::string splitter = "__"; std::string splitter = "__";
std::string name_with_suffix = name + splitter + name_suffix; std::string name_with_suffix = name + splitter + name_suffix;
platform::CPUPlace cpu_place; platform::CPUPlace cpu_place;
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(weight_map.count(name_with_suffix), 0,
weight_map.count(name_with_suffix), 0, platform::errors::AlreadyExists(
"During TRT Op converter: We set weight %s with the same name " "The weight named %s is set into the weight map "
"twice into the weight_map", "twice in TRT OP converter.",
name_with_suffix); name_with_suffix));
weight_map[name_with_suffix].reset(new framework::Tensor()); weight_map[name_with_suffix].reset(new framework::Tensor());
weight_map[name_with_suffix]->Resize(weight_tensor->dims()); weight_map[name_with_suffix]->Resize(weight_tensor->dims());
TensorCopySync(*weight_tensor, cpu_place, weight_map[name_with_suffix].get()); TensorCopySync(*weight_tensor, cpu_place, weight_map[name_with_suffix].get());
...@@ -297,7 +332,10 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( ...@@ -297,7 +332,10 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
void TensorRTEngine::freshDeviceId() { void TensorRTEngine::freshDeviceId() {
int count; int count;
cudaGetDeviceCount(&count); cudaGetDeviceCount(&count);
PADDLE_ENFORCE_LT(device_id_, count); PADDLE_ENFORCE_LT(device_id_, count,
platform::errors::OutOfRange(
"Device id %d exceeds the current device count: %d.",
device_id_, count));
cudaSetDevice(device_id_); cudaSetDevice(device_id_);
} }
......
...@@ -196,8 +196,10 @@ class TensorRTEngine { ...@@ -196,8 +196,10 @@ class TensorRTEngine {
} }
nvinfer1::IHostMemory* Serialize() { nvinfer1::IHostMemory* Serialize() {
PADDLE_ENFORCE(infer_engine_ != nullptr, PADDLE_ENFORCE_NOT_NULL(
"You should build engine first and then serialize"); infer_engine_,
platform::errors::InvalidArgument(
"The TensorRT engine must be built first before serialization"));
ihost_memory_.reset(infer_engine_->serialize()); ihost_memory_.reset(infer_engine_->serialize());
return ihost_memory_.get(); return ihost_memory_.get();
} }
...@@ -222,8 +224,14 @@ class TensorRTEngine { ...@@ -222,8 +224,14 @@ class TensorRTEngine {
engine_serialized_data.c_str(), engine_serialized_data.size(), engine_serialized_data.c_str(), engine_serialized_data.size(),
&inference::Singleton<plugin::PluginFactoryTensorRT>::Global())); &inference::Singleton<plugin::PluginFactoryTensorRT>::Global()));
} }
PADDLE_ENFORCE(infer_engine_ != nullptr, PADDLE_ENFORCE_NOT_NULL(
"build cuda engine failed when deserialize engine info.!"); infer_engine_,
platform::errors::Fatal(
"Building TRT cuda engine failed when deserializing engine info. "
"Please check:\n1. Your TRT serialization is generated and loaded "
"on the same GPU architecture;\n2. The Paddle Inference version of "
"generating serialization file and doing inference are "
"consistent."));
} }
void SetRuntimeBatch(size_t batch_size); void SetRuntimeBatch(size_t batch_size);
......
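A minimal sketch of the serialize/deserialize round trip guarded by the checks above, assuming engine is a built TensorRTEngine* and that the member consuming engine_serialized_data is Deserialize(const std::string&); only Serialize() is fully visible in this hunk, so both assumptions are hedged.
nvinfer1::IHostMemory* blob = engine->Serialize();  // enforces that the engine was built first
std::string engine_bytes(static_cast<const char*>(blob->data()), blob->size());
// ... persist engine_bytes, then later, on the same GPU architecture and with the
// same Paddle Inference version, restore it:
engine->Deserialize(engine_bytes);  // raises the Fatal error above if rebuilding fails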
...@@ -56,14 +56,27 @@ __global__ void elementwise_kernel(const size_t total, const T *x_data, ...@@ -56,14 +56,27 @@ __global__ void elementwise_kernel(const size_t total, const T *x_data,
nvinfer1::Dims ElementWisePlugin::getOutputDimensions( nvinfer1::Dims ElementWisePlugin::getOutputDimensions(
int index, const nvinfer1::Dims *input_dims, int num_inputs) { int index, const nvinfer1::Dims *input_dims, int num_inputs) {
PADDLE_ENFORCE_EQ(index, 0); PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
PADDLE_ENFORCE_EQ(num_inputs, 2); "There is only one output in TRT elementwise "
PADDLE_ENFORCE_NOT_NULL(input_dims); "op plugin, but got output index: %d.",
index));
PADDLE_ENFORCE_EQ(num_inputs, 2, platform::errors::InvalidArgument(
"There are 2 inputs in TRT elementwise "
"op plugin, but got input number: %d.",
num_inputs));
PADDLE_ENFORCE_NOT_NULL(
input_dims,
platform::errors::InvalidArgument(
"The input dims of TRT elementwise op plugin should not be null."));
return input_dims[0]; return input_dims[0];
} }
int ElementWisePlugin::initialize() { int ElementWisePlugin::initialize() {
PADDLE_ENFORCE_GT(dims_y_.nbDims, 0); PADDLE_ENFORCE_GT(dims_y_.nbDims, 0,
platform::errors::InvalidArgument(
"The dimension of input Y of TRT elementwise op plugin "
"should be greater than 0, but got %d.",
dims_y_.nbDims));
axis_ = (axis_ == -1) ? dims_x_.nbDims - dims_y_.nbDims : axis_; axis_ = (axis_ == -1) ? dims_x_.nbDims - dims_y_.nbDims : axis_;
int trimed_nb_dims = dims_y_.nbDims; int trimed_nb_dims = dims_y_.nbDims;
...@@ -74,8 +87,18 @@ int ElementWisePlugin::initialize() { ...@@ -74,8 +87,18 @@ int ElementWisePlugin::initialize() {
} }
dims_y_.nbDims = trimed_nb_dims; dims_y_.nbDims = trimed_nb_dims;
PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_); PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_,
PADDLE_ENFORCE_LT(axis_, dims_x_.nbDims); platform::errors::InvalidArgument(
"We expect [number of x dims] >= [number of y dims + "
"axis] in TRT elementwise op plugin, but got [number "
"of x dims] = %d, [number of y dims + axis] = %d.",
dims_x_.nbDims, dims_y_.nbDims + axis_));
PADDLE_ENFORCE_LT(
axis_, dims_x_.nbDims,
platform::errors::InvalidArgument("We expect [axis] < [number of x dims] "
"in TRT elementwise op plugin, but got "
"[axis] = %d, [number of x dims] = %d.",
axis_, dims_x_.nbDims));
prev_size_ = 1; prev_size_ = 1;
midd_size_ = 1; midd_size_ = 1;
...@@ -86,7 +109,9 @@ int ElementWisePlugin::initialize() { ...@@ -86,7 +109,9 @@ int ElementWisePlugin::initialize() {
for (int i = 0; i < dims_y_.nbDims; ++i) { for (int i = 0; i < dims_y_.nbDims; ++i) {
PADDLE_ENFORCE_EQ(dims_x_.d[i + axis_], dims_y_.d[i], PADDLE_ENFORCE_EQ(dims_x_.d[i + axis_], dims_y_.d[i],
"Broadcast dimension mismatch."); platform::errors::InvalidArgument(
"Broadcast dimension mismatch. The dims of input Y "
"should be a subsequence of X."));
midd_size_ *= dims_y_.d[i]; midd_size_ *= dims_y_.d[i];
} }
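A worked illustration of the decomposition initialize() builds for broadcasting (all values below are assumed, and post_size_ is computed outside this hunk):
// dims_x_ = {8, 16, 32}, dims_y_ = {16}, axis_ = 1
// prev_size_ = 8    -> product of X dims before axis_
// midd_size_ = 16   -> product of Y dims; must equal the aligned X dims
// post_size_ = 32   -> product of the trailing X dims
// Each Y element is then reused across the prev and post extents by the CUDA kernel.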
...@@ -221,7 +246,10 @@ int ElementwisePluginDynamic::enqueue( ...@@ -221,7 +246,10 @@ int ElementwisePluginDynamic::enqueue(
elementwise_kernel<<<block, thread, 0, stream>>>( elementwise_kernel<<<block, thread, 0, stream>>>(
num, x, y, out, prev_size, midd_size, post_size, details::Mul<float>()); num, x, y, out, prev_size, midd_size, post_size, details::Mul<float>());
} else { } else {
PADDLE_THROW("Not implemented."); PADDLE_THROW(platform::errors::Unimplemented(
"Paddle-TRT only support elementwise operation: {add, mul} currently, "
"but got %s.",
type_));
} }
return cudaGetLastError() != cudaSuccess; return cudaGetLastError() != cudaSuccess;
......
...@@ -74,7 +74,9 @@ TEST_F(TensorRTEngineTest, add_layer) { ...@@ -74,7 +74,9 @@ TEST_F(TensorRTEngineTest, add_layer) {
nvinfer1::DimsCHW{1, 1, 1}); nvinfer1::DimsCHW{1, 1, 1});
auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size, auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size,
weight.get(), bias.get()); weight.get(), bias.get());
PADDLE_ENFORCE(fc_layer != nullptr); PADDLE_ENFORCE_NOT_NULL(fc_layer,
platform::errors::InvalidArgument(
"TRT fully connected layer building failed."));
engine_->DeclareOutput(fc_layer, 0, "y"); engine_->DeclareOutput(fc_layer, 0, "y");
LOG(INFO) << "freeze network"; LOG(INFO) << "freeze network";
...@@ -116,7 +118,9 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { ...@@ -116,7 +118,9 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
nvinfer1::DimsCHW{1, 2, 1}); nvinfer1::DimsCHW{1, 2, 1});
auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2, auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2,
weight.get(), bias.get()); weight.get(), bias.get());
PADDLE_ENFORCE(fc_layer != nullptr); PADDLE_ENFORCE_NOT_NULL(fc_layer,
platform::errors::InvalidArgument(
"TRT fully connected layer building failed."));
engine_->DeclareOutput(fc_layer, 0, "y"); engine_->DeclareOutput(fc_layer, 0, "y");
engine_->FreezeNetwork(); engine_->FreezeNetwork();
...@@ -160,7 +164,9 @@ TEST_F(TensorRTEngineTest, test_conv2d) { ...@@ -160,7 +164,9 @@ TEST_F(TensorRTEngineTest, test_conv2d) {
auto *conv_layer = auto *conv_layer =
TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3}, TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3},
weight.get(), bias.get()); weight.get(), bias.get());
PADDLE_ENFORCE(conv_layer != nullptr); PADDLE_ENFORCE_NOT_NULL(conv_layer,
platform::errors::InvalidArgument(
"TRT convolution layer building failed."));
conv_layer->setStride(nvinfer1::DimsHW{1, 1}); conv_layer->setStride(nvinfer1::DimsHW{1, 1});
conv_layer->setPadding(nvinfer1::DimsHW{1, 1}); conv_layer->setPadding(nvinfer1::DimsHW{1, 1});
...@@ -199,7 +205,9 @@ TEST_F(TensorRTEngineTest, test_pool2d) { ...@@ -199,7 +205,9 @@ TEST_F(TensorRTEngineTest, test_pool2d) {
auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *x, pool_t, auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *x, pool_t,
nvinfer1::DimsHW{2, 2}); nvinfer1::DimsHW{2, 2});
PADDLE_ENFORCE(pool_layer != nullptr); PADDLE_ENFORCE_NOT_NULL(
pool_layer,
platform::errors::InvalidArgument("TRT pooling layer building failed."));
pool_layer->setStride(nvinfer1::DimsHW{1, 1}); pool_layer->setStride(nvinfer1::DimsHW{1, 1});
pool_layer->setPadding(nvinfer1::DimsHW{0, 0}); pool_layer->setPadding(nvinfer1::DimsHW{0, 0});
......
...@@ -83,9 +83,8 @@ bool TRTInt8Calibrator::setBatch( ...@@ -83,9 +83,8 @@ bool TRTInt8Calibrator::setBatch(
engine_name_, it.first)); engine_name_, it.first));
} }
const auto& d = dataptr->second; const auto& d = dataptr->second;
PADDLE_ENFORCE( PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice), cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice));
"Fail to cudaMemcpy %s for %s", engine_name_, it.first);
} }
data_is_set_ = true; data_is_set_ = true;
......
...@@ -342,9 +342,9 @@ if(WITH_MKLDNN) ...@@ -342,9 +342,9 @@ if(WITH_MKLDNN)
### Lexical analysis GRU model ### Lexical analysis GRU model
set(GRU_PATH "${INFERENCE_DEMO_INSTALL_DIR}/gru") set(GRU_PATH "${INFERENCE_DEMO_INSTALL_DIR}/gru")
download_GRU_data("${GRU_PATH}" "GRU_eval_data.tar.gz") download_GRU_data("${GRU_PATH}" "GRU_eval_data.tar.gz")
download_GRU_data("${GRU_PATH}" "GRU_eval_model.tar.gz") download_GRU_data("${GRU_PATH}" "GRU_eval_model_v2.tar.gz")
set(GRU_DATA_PATH "${GRU_PATH}/GRU_eval_data.bin") set(GRU_DATA_PATH "${GRU_PATH}/GRU_eval_data.bin")
set(GRU_MODEL_PATH "${GRU_PATH}/GRU_eval_model") set(GRU_MODEL_PATH "${GRU_PATH}/GRU_eval_model_v2")
set(LEXICAL_TEST_APP "test_analyzer_lexical_analysis") set(LEXICAL_TEST_APP "test_analyzer_lexical_analysis")
set(LEXICAL_TEST_APP_SRC "analyzer_lexical_analysis_gru_tester.cc") set(LEXICAL_TEST_APP_SRC "analyzer_lexical_analysis_gru_tester.cc")
......
...@@ -27,7 +27,7 @@ TEST(AnalysisPredictor, use_gpu) { ...@@ -27,7 +27,7 @@ TEST(AnalysisPredictor, use_gpu) {
AnalysisConfig config; AnalysisConfig config;
config.EnableUseGpu(100, 0); config.EnableUseGpu(100, 0);
config.SetModel(model_dir + "/model", model_dir + "/params"); config.SetModel(model_dir + "/model", model_dir + "/params");
config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32); config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32, true);
std::vector<PaddleTensor> inputs; std::vector<PaddleTensor> inputs;
auto predictor = CreatePaddlePredictor(config); auto predictor = CreatePaddlePredictor(config);
......
...@@ -45,7 +45,9 @@ endif() ...@@ -45,7 +45,9 @@ endif()
SET(OP_HEADER_DEPS xxhash executor) SET(OP_HEADER_DEPS xxhash executor)
if (WITH_GPU) if (WITH_GPU)
SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub) if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub)
endif()
endif() endif()
SET(OP_PREFETCH_DEPS "") SET(OP_PREFETCH_DEPS "")
......
...@@ -12,32 +12,30 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,32 +12,30 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h" #include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h"
#include "paddle/fluid/framework/tensor_util.h"
#include <string>
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
class AmpCheckFiniteAndScaleOp : public framework::OperatorWithKernel { class CheckFiniteAndUnscaleOp : public framework::OperatorWithKernel {
public: public:
AmpCheckFiniteAndScaleOp(const std::string &type, CheckFiniteAndUnscaleOp(const std::string& type,
const framework::VariableNameMap &inputs, const framework::VariableNameMap& inputs,
const framework::VariableNameMap &outputs, const framework::VariableNameMap& outputs,
const framework::AttributeMap &attrs) const framework::AttributeMap& attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {} : OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X",
"amp_check_finite_and_unscale"); "check_finite_and_unscale");
OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out", OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out",
"amp_check_finite_and_unscale"); "check_finite_and_unscale");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
ctx->Inputs("X").size(), ctx->Outputs("Out").size(), ctx->Inputs("X").size(), ctx->Outputs("Out").size(),
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The input(X) and output(Out) should have same size in " "The input(X) and output(Out) should have same size in "
"Operator(amp_check_finite_and_unscale), size of input(X) is %d " "Operator(check_finite_and_unscale), size of input(X) is %d "
"and size of output(Out) is %d.", "and size of output(Out) is %d.",
ctx->Inputs("X").size(), ctx->Outputs("Out").size())); ctx->Inputs("X").size(), ctx->Outputs("Out").size()));
auto x_dims = ctx->GetInputsDim("X"); auto x_dims = ctx->GetInputsDim("X");
...@@ -47,34 +45,34 @@ class AmpCheckFiniteAndScaleOp : public framework::OperatorWithKernel { ...@@ -47,34 +45,34 @@ class AmpCheckFiniteAndScaleOp : public framework::OperatorWithKernel {
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
} }
}; };
class AmpCheckFiniteAndScaleOpMaker : public framework::OpProtoAndCheckerMaker { class CheckFiniteAndUnscaleOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddInput( AddInput(
"X", "X",
"(Tensors) The input tensors of amp_check_finite_and_scale operator.") "(Tensors) The input tensors of check_finite_and_unscale operator.")
.AsDuplicable(); .AsDuplicable();
AddInput("Scale", AddInput("Scale",
"(Tensor) 1-dim tensor, the scale of amp_check_finite_and_scale " "(Tensor) 1-dim tensor, the scale of check_finite_and_unscale "
"operator."); "operator.");
AddOutput("Out", AddOutput("Out",
"(Tensors) The scaled output tensor of " "(Tensors) The scaled output tensor of "
"amp_check_finite_and_unscale operator.") "check_finite_and_unscale operator.")
.AsDuplicable(); .AsDuplicable();
AddOutput("FoundInfinite", AddOutput("FoundInfinite",
"(Tensor) 1-dim tensor, contains a bool scalar, which indicates " "(Tensor) 1-dim tensor, contains a bool scalar, which indicates "
"if there there is infinite or nan item in input X."); "if there there is infinite or nan item in input X.");
AddComment(R"DOC( AddComment(R"DOC(
amp_check_finite_and_scale operator. check_finite_and_unscale operator.
Check if input X contains all finite data, if yes, scale it by input Scale. Check if input X contains all finite data, if yes, scale it by input Scale.
$$Out = X * scale$$ $$Out = X / scale$$
If any tensor in X contains Inf or Nan, the Out will generate an indicator. If any tensor in X contains Inf or Nan, the Out will generate an indicator.
FoundInfinite will be 1 (True), and Out will not be scaled. In this case, the data of FoundInfinite will be 1 (True), and Out will not be scaled. In this case, the data of
...@@ -85,20 +83,59 @@ Otherwise, FoundInfinite will be 0 (False). ...@@ -85,20 +83,59 @@ Otherwise, FoundInfinite will be 0 (False).
} }
}; };
template <typename T>
class CheckFiniteAndUnscaleCpuKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
const auto xs = ctx.MultiInput<framework::Tensor>("X");
const auto* scale = ctx.Input<framework::Tensor>("Scale");
auto outs = ctx.MultiOutput<framework::Tensor>("Out");
auto* found_inf = ctx.Output<framework::Tensor>("FoundInfinite");
const T* scale_data = scale->data<T>();
bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace());
*found_inf_data = false;
framework::Tensor is_finite =
ctx.AllocateTmpTensor<bool, platform::CPUDeviceContext>({1}, dev_ctx);
bool* is_finite_data = is_finite.template data<bool>();
auto& dev = *ctx.template device_context<platform::CPUDeviceContext>()
.eigen_device();
T inverse_scale = Inverse<T>(*scale_data);
for (size_t i = 0; i < xs.size(); ++i) {
const auto* x = xs[i];
auto* out = outs[i];
out->mutable_data<T>(dev_ctx.GetPlace());
if (!(*found_inf_data)) {
framework::TensorIsfinite(*x, &is_finite);
*found_inf_data = !(*is_finite_data);
}
auto eigen_out = framework::EigenVector<T>::Flatten(*out);
auto eigen_in = framework::EigenVector<T>::Flatten(*x);
if (!(*found_inf_data)) {
eigen_out.device(dev) = eigen_in * inverse_scale;
} else {
eigen_out.device(dev) = eigen_in * static_cast<T>(0);
}
}
return;
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR( REGISTER_OPERATOR(
amp_check_finite_and_scale, ops::AmpCheckFiniteAndScaleOp, check_finite_and_unscale, ops::CheckFiniteAndUnscaleOp,
ops::AmpCheckFiniteAndScaleOpMaker, ops::CheckFiniteAndUnscaleOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>, paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>); paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(check_finite_and_unscale,
amp_check_finite_and_scale, ops::CheckFiniteAndUnscaleCpuKernel<float>,
ops::AmpCheckFiniteAndScaleKernel<paddle::platform::CPUDeviceContext, ops::CheckFiniteAndUnscaleCpuKernel<double>);
float>,
ops::AmpCheckFiniteAndScaleKernel<paddle::platform::CPUDeviceContext,
double>);
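A note on the semantics documented above: Out = X / Scale only when every element of every input tensor is finite; otherwise FoundInfinite is set and the outputs are not valid unscaled values. The standalone C++ sketch below (plain std::vector instead of framework::Tensor; the helper name CheckFiniteAndUnscaleRef is made up for illustration) mirrors that contract:

#include <cmath>
#include <cstdio>
#include <vector>

// Illustrative stand-in for the operator: every element is multiplied by
// 1/scale; once a non-finite value is seen, found_infinite is set and the
// remaining outputs are zeroed instead of unscaled.
void CheckFiniteAndUnscaleRef(const std::vector<std::vector<float>>& xs,
                              float scale,
                              std::vector<std::vector<float>>* outs,
                              bool* found_infinite) {
  *found_infinite = false;
  const float inverse_scale = 1.0f / scale;
  outs->assign(xs.size(), std::vector<float>());
  for (size_t i = 0; i < xs.size(); ++i) {
    (*outs)[i].resize(xs[i].size());
    for (size_t j = 0; j < xs[i].size(); ++j) {
      if (!std::isfinite(xs[i][j])) *found_infinite = true;
      (*outs)[i][j] = *found_infinite ? 0.0f : xs[i][j] * inverse_scale;
    }
  }
}

int main() {
  bool found_infinite = false;
  std::vector<std::vector<float>> outs;
  CheckFiniteAndUnscaleRef({{2.0f, 4.0f}, {8.0f}}, 2.0f, &outs,
                           &found_infinite);
  // Prints: found_infinite=0 out=1.0 2.0 4.0
  std::printf("found_infinite=%d out=%.1f %.1f %.1f\n", found_infinite,
              outs[0][0], outs[0][1], outs[1][0]);
  return 0;
}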
...@@ -14,28 +14,31 @@ limitations under the License. */ ...@@ -14,28 +14,31 @@ limitations under the License. */
#include <cuda.h> #include <cuda.h>
#include "paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h" #include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename T> template <typename T>
__global__ void AmpCheckFiniteAndScale(const T* in, const T* scale, int num, __global__ void GpuInverse(const T* s, T* o) {
bool* found_inf, T* out) { *o = Inverse<T>(*s);
}
template <typename T>
__global__ void CheckFiniteAndUnscale(const T* in, const T* scale, int num,
bool* found_inf, T* out) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x; const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < num) { if (idx < num) {
if (!isfinite(in[idx])) { if (!isfinite(in[idx])) {
*found_inf = 1; *found_inf = true;
} }
out[idx] = *found_inf ? in[idx] : in[idx] * scale[0]; out[idx] = *found_inf ? in[idx] : in[idx] * (*scale);
} }
} }
template <typename T> template <typename T>
class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T> class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel<T> {
: public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const { void Compute(const framework::ExecutionContext& ctx) const {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
...@@ -48,6 +51,12 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T> ...@@ -48,6 +51,12 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T>
bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace()); bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace());
cudaMemset(found_inf_data, false, found_inf->numel() * sizeof(bool)); cudaMemset(found_inf_data, false, found_inf->numel() * sizeof(bool));
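// Compute 1/scale once on the device with a one-thread kernel, so the
// unscale kernel below can multiply by it and the scale value never has to
// be copied back to the host.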
framework::Tensor inverse_scale =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({1}, dev_ctx);
T* inverse_scale_v = inverse_scale.template data<T>();
GpuInverse<T><<<1, 1, 0, dev_ctx.stream()>>>(scale_data, inverse_scale_v);
for (size_t i = 0; i < xs.size(); ++i) { for (size_t i = 0; i < xs.size(); ++i) {
const auto* x = xs[i]; const auto* x = xs[i];
auto* out = outs[i]; auto* out = outs[i];
...@@ -55,11 +64,11 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T> ...@@ -55,11 +64,11 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T>
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace()); T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = x->numel(); int num = x->numel();
int block = 512; int block = 1024;
int grid = (num + block - 1) / block; int grid = (num + block - 1) / block;
VLOG(3) << "launch kernel"; VLOG(3) << "launch kernel";
AmpCheckFiniteAndScale<T><<<grid, block, 0, dev_ctx.stream()>>>( CheckFiniteAndUnscale<T><<<grid, block, 0, dev_ctx.stream()>>>(
x_data, scale_data, num, found_inf_data, out_data); x_data, inverse_scale_v, num, found_inf_data, out_data);
VLOG(3) << "finish kernel"; VLOG(3) << "finish kernel";
} }
} }
...@@ -68,9 +77,6 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T> ...@@ -68,9 +77,6 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T>
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(check_finite_and_unscale,
amp_check_finite_and_scale, ops::CheckFiniteAndUnscaleGpuKernel<float>,
ops::AmpCheckFiniteAndScaleKernel<paddle::platform::CUDADeviceContext, ops::CheckFiniteAndUnscaleGpuKernel<double>);
float>,
ops::AmpCheckFiniteAndScaleKernel<paddle::platform::CUDADeviceContext,
double>);
...@@ -16,51 +16,16 @@ limitations under the License. */ ...@@ -16,51 +16,16 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/isfinite_op.h" #include "paddle/fluid/operators/isfinite_op.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename DeviceContext, typename T> template <typename T>
class AmpCheckFiniteAndScaleKernel : public framework::OpKernel<T> { inline HOSTDEVICE T Inverse(T s) {
public: return 1.0 / s;
void Compute(const framework::ExecutionContext& ctx) const { }
auto& dev_ctx = ctx.template device_context<DeviceContext>();
const auto xs = ctx.MultiInput<framework::Tensor>("X");
const auto* scale = ctx.Input<framework::Tensor>("Scale");
auto outs = ctx.MultiOutput<framework::Tensor>("Out");
auto* found_inf = ctx.Output<framework::Tensor>("FoundInfinite");
const T* scale_data = scale->data<T>();
bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace());
*found_inf_data = false;
framework::Tensor is_finite =
ctx.AllocateTmpTensor<bool, DeviceContext>({1}, dev_ctx);
bool* is_finite_data = is_finite.template data<bool>();
auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
for (size_t i = 0; i < xs.size(); ++i) {
const auto* x = xs[i];
auto* out = outs[i];
out->mutable_data<T>(dev_ctx.GetPlace());
if (!(*found_inf_data)) {
framework::TensorIsfinite(*x, &is_finite);
if (*is_finite_data) {
auto eigen_out = framework::EigenVector<T>::Flatten(*out);
auto eigen_in = framework::EigenVector<T>::Flatten(*x);
eigen_out.device(dev) = (*scale_data) * eigen_in;
} else {
*found_inf_data = true;
break;
}
}
}
return;
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/amp/update_loss_scaling_op.h"
#include <cstring>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
class UpdateLossScalingOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "update_loss_scaling");
OP_INOUT_CHECK(ctx->HasInput("FoundInfinite"), "Input", "FoundInfinite",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasInput("PrevLossScaling"), "Input", "PrevLossScaling",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasInput("InGoodSteps"), "Input", "InGoodSteps",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasInput("InBadSteps"), "Input", "InBadSteps",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasOutput("LossScaling"), "Output", "LossScaling",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasOutput("OutGoodSteps"), "Output", "OutGoodSteps",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasOutput("OutBadSteps"), "Output", "OutBadSteps",
"update_loss_scaling");
auto x_dims = ctx->GetInputsDim("X");
ctx->SetOutputsDim("Out", x_dims);
ctx->SetOutputDim("LossScaling", {1});
ctx->SetOutputDim("OutGoodSteps", {1});
ctx->SetOutputDim("OutBadSteps", {1});
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "PrevLossScaling"),
ctx.device_context());
}
};
class UpdateLossScalingOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensors) The input tensors of update_loss_scaling operator.")
.AsDuplicable();
AddInput("FoundInfinite",
"(Tensor) 1-dim tensor, contains a bool scalar, which indicates "
"whether there is any infinite gradient.");
AddInput("PrevLossScaling",
"(Tensor) 1-dim tensor, previous loss scaling.");
AddInput("InGoodSteps",
"(Tensor) 1-dim tensor, accumulates good steps in which all "
"gradients are finite.");
AddInput("InBadSteps",
"(Tensor) 1-dim tensor, accumulates bad steps in which some "
"gradients are infinite.");
AddOutput("Out",
"(Tensors) The output tensor of update_loss_scaling operator.")
.AsDuplicable();
AddOutput("LossScaling", "(Tensor) 1-dim tensor, updated loss scaling.");
AddOutput("OutGoodSteps", "(Tensor) 1-dim tensor, pdated good steps.");
AddOutput("OutBadSteps", "(Tensor) 1-dim tensor, updated bad steps.");
AddAttr<int>("incr_every_n_steps",
"A value represents increasing loss scaling every n "
"consecutive steps with finite gradients.");
AddAttr<int>("decr_every_n_nan_or_inf",
"A value represents decreasing loss scaling every n "
"accumulated steps with nan or inf gradients.");
AddAttr<float>("incr_ratio",
"The multiplier to use when increasing the loss scaling.")
.AddCustomChecker([](float incr_ratio) {
PADDLE_ENFORCE_EQ(incr_ratio > 1.0f, true,
platform::errors::InvalidArgument(
"'incr_ratio' should be greater than 1, but "
"the received is %f",
incr_ratio));
});
AddAttr<float>(
"decr_ratio",
"The less-than-one-multiplier to use when decreasing loss scaling.")
.AddCustomChecker([](float decr_ratio) {
PADDLE_ENFORCE_EQ(decr_ratio > 0.0f && decr_ratio < 1.0f, true,
platform::errors::InvalidArgument(
"'incr_ratio' should be between 0 and 1, but "
"the received is %f",
decr_ratio));
});
AddComment(R"DOC(
Update the loss scaling according to the state of the overall gradients. If all
gradients are finite for incr_every_n_steps consecutive steps, the loss scaling
will increase by incr_ratio. Otherwise, the loss scaling will decrease by
decr_ratio after decr_every_n_nan_or_inf accumulated steps in which some
gradients are infinite.
)DOC");
}
};
template <typename T>
class UpdateLossScalingFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& ctx,
const bool* found_inf_data, const T* pre_loss_scaling_data,
const int* good_in_data, const int* bad_in_data,
const int incr_every_n_steps,
const int decr_every_n_nan_or_inf, const float incr_ratio,
const float decr_ratio, T* updated_loss_scaling_data,
int* good_out_data, int* bad_out_data) const {
Update<T>(found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data,
incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio,
decr_ratio, updated_loss_scaling_data, good_out_data,
bad_out_data);
}
};
template <typename T>
class LazyZeroInputs<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& dev_ctx,
const bool* found_inf_data,
const std::vector<const framework::Tensor*>& xs,
const std::vector<framework::Tensor*>& outs) const {
if (*found_inf_data) {
VLOG(1) << "-- UpdateLossScaling: Infinite values are found in grads. --";
for (size_t i = 0; i < xs.size(); ++i) {
auto* out = outs[i];
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = out->numel();
std::memset(out_data, 0, num * sizeof(T));
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(
update_loss_scaling, ops::UpdateLossScalingOp,
ops::UpdateLossScalingOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(update_loss_scaling,
ops::UpdateLossScalingKernel<CPU, float>,
ops::UpdateLossScalingKernel<CPU, double>);
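To make the counter logic described in the op comment above concrete, here is a small host-only C++ sketch with a short numeric walk-through; it is illustrative only — the real implementation is the Update<T> helper declared in update_loss_scaling_op.h further below, which additionally clamps a decreased scale at 1.0 and rejects an increase that would overflow to Inf:

#include <cassert>

struct LossScaleState {
  float scale;
  int good;  // consecutive steps with all-finite gradients
  int bad;   // accumulated steps with Inf/NaN gradients
};

// Simplified mirror of the update rule, for illustration only.
LossScaleState Step(LossScaleState s, bool found_inf, int incr_every_n_steps,
                    int decr_every_n_nan_or_inf, float incr_ratio,
                    float decr_ratio) {
  if (found_inf) {
    s.good = 0;
    if (++s.bad == decr_every_n_nan_or_inf) {
      s.scale *= decr_ratio;  // too many bad steps: shrink the scale
      s.bad = 0;
    }
  } else {
    s.bad = 0;
    if (++s.good == incr_every_n_steps) {
      s.scale *= incr_ratio;  // enough consecutive good steps: grow the scale
      s.good = 0;
    }
  }
  return s;
}

int main() {
  LossScaleState s{1024.0f, 0, 0};
  s = Step(s, false, 2, 1, 2.0f, 0.5f);  // 1st good step, scale stays 1024
  s = Step(s, false, 2, 1, 2.0f, 0.5f);  // 2nd good step -> scale becomes 2048
  s = Step(s, true, 2, 1, 2.0f, 0.5f);   // 1 bad step -> scale halves to 1024
  assert(s.scale == 1024.0f);
  return 0;
}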
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/amp/update_loss_scaling_op.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void GpuUpdateLossScaling(
const bool* found_inf_data, const T* pre_loss_scaling_data,
const int* good_in_data, const int* bad_in_data,
const int incr_every_n_steps, const int decr_every_n_nan_or_inf,
const float incr_ratio, const float decr_ratio,
T* updated_loss_scaling_data, int* good_out_data, int* bad_out_data) {
Update<T>(found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data,
incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio,
updated_loss_scaling_data, good_out_data, bad_out_data);
}
template <typename T>
class UpdateLossScalingFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& dev_ctx,
const bool* found_inf_data, const T* pre_loss_scaling_data,
const int* good_in_data, const int* bad_in_data,
const int incr_every_n_steps,
const int decr_every_n_nan_or_inf, const float incr_ratio,
const float decr_ratio, T* updated_loss_scaling_data,
int* good_out_data, int* bad_out_data) const {
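    // The loss-scaling state is scalar (one scale value plus two step
    // counters), so a single-thread launch is sufficient here.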
GpuUpdateLossScaling<T><<<1, 1, 0, dev_ctx.stream()>>>(
found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data,
incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio,
updated_loss_scaling_data, good_out_data, bad_out_data);
}
};
template <typename T>
class LazyZeroInputs<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& dev_ctx,
const bool* found_inf_data,
const std::vector<const framework::Tensor*>& xs,
const std::vector<framework::Tensor*>& outs) const {
const auto gpu_place =
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace());
bool has_inf{false};
memory::Copy(platform::CPUPlace(), &has_inf, gpu_place, found_inf_data,
sizeof(bool), dev_ctx.stream());
if (has_inf) {
VLOG(1) << "-- UpdateLossScaling: Infinite values are found in grads. --";
for (size_t i = 0; i < xs.size(); ++i) {
auto* out = outs[i];
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = out->numel();
cudaMemset(out_data, 0, num * sizeof(T));
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using GPU = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(update_loss_scaling,
ops::UpdateLossScalingKernel<GPU, float>,
ops::UpdateLossScalingKernel<GPU, double>);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cmath>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/errors.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
HOSTDEVICE void Update(const bool* found_inf_data,
const T* pre_loss_scaling_data, const int* good_in_data,
const int* bad_in_data, const int incr_every_n_steps,
const int decr_every_n_nan_or_inf,
const float incr_ratio, const float decr_ratio,
T* updated_loss_scaling_data, int* good_out_data,
int* bad_out_data) {
if (*found_inf_data) {
*good_out_data = 0;
*bad_out_data = *bad_in_data + 1;
if (*bad_out_data == decr_every_n_nan_or_inf) {
T new_loss_scaling = *pre_loss_scaling_data * decr_ratio;
*updated_loss_scaling_data = new_loss_scaling < static_cast<T>(1)
? static_cast<T>(1)
: new_loss_scaling;
*bad_out_data = 0;
}
} else {
*bad_out_data = 0;
*good_out_data = *good_in_data + 1;
if (*good_out_data == incr_every_n_steps) {
T new_loss_scaling = *pre_loss_scaling_data * incr_ratio;
*updated_loss_scaling_data = std::isfinite(new_loss_scaling)
? new_loss_scaling
: *pre_loss_scaling_data;
*good_out_data = 0;
}
}
}
template <typename DeviceContext, typename T>
class UpdateLossScalingFunctor {
public:
void operator()(const DeviceContext& dev_ctx, const bool* found_inf_data,
const T* pre_loss_scaling_data, const int* good_in_data,
const int* bad_in_data, const int incr_every_n_steps,
const int decr_every_n_nan_or_inf, const float incr_ratio,
const float decr_ratio, T* updated_loss_scaling_data,
int* good_out_data, int* bad_out_data) const;
};
template <typename DeviceContext, typename T>
class LazyZeroInputs {
public:
void operator()(const DeviceContext& dev_ctx, const bool* found_inf_data,
const std::vector<const framework::Tensor*>& xs,
const std::vector<framework::Tensor*>& outs) const;
};
template <typename DeviceContext, typename T>
class UpdateLossScalingKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto xs = ctx.MultiInput<framework::Tensor>("X");
const auto* found_inf = ctx.Input<Tensor>("FoundInfinite");
const auto* pre_loss_scaling = ctx.Input<Tensor>("PrevLossScaling");
const auto* good_in = ctx.Input<Tensor>("InGoodSteps");
const auto* bad_in = ctx.Input<Tensor>("InBadSteps");
auto outs = ctx.MultiOutput<framework::Tensor>("Out");
auto* updated_loss_scaling = ctx.Output<Tensor>("LossScaling");
auto* good_out = ctx.Output<Tensor>("OutGoodSteps");
auto* bad_out = ctx.Output<Tensor>("OutBadSteps");
PADDLE_ENFORCE_EQ(found_inf->numel(), 1,
platform::errors::InvalidArgument(
"FoundInfinite must has only one element."));
const bool* found_inf_data = found_inf->data<bool>();
const T* pre_loss_scaling_data = pre_loss_scaling->data<T>();
const int* good_in_data = good_in->data<int>();
const int* bad_in_data = bad_in->data<int>();
auto& dev_ctx = ctx.template device_context<DeviceContext>();
T* updated_loss_scaling_data =
updated_loss_scaling->mutable_data<T>(dev_ctx.GetPlace());
int* good_out_data = good_out->mutable_data<int>(dev_ctx.GetPlace());
int* bad_out_data = bad_out->mutable_data<int>(dev_ctx.GetPlace());
const int incr_every_n_steps = ctx.Attr<int>("incr_every_n_steps");
const int decr_every_n_nan_or_inf =
ctx.Attr<int>("decr_every_n_nan_or_inf");
const float incr_ratio = ctx.Attr<float>("incr_ratio");
const float decr_ratio = ctx.Attr<float>("decr_ratio");
UpdateLossScalingFunctor<DeviceContext, T>{}(
dev_ctx, found_inf_data, pre_loss_scaling_data, good_in_data,
bad_in_data, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio,
decr_ratio, updated_loss_scaling_data, good_out_data, bad_out_data);
LazyZeroInputs<DeviceContext, T>{}(dev_ctx, found_inf_data, xs, outs);
}
};
} // namespace operators
} // namespace paddle
...@@ -111,8 +111,16 @@ class CompareOp : public framework::OperatorWithKernel { ...@@ -111,8 +111,16 @@ class CompareOp : public framework::OperatorWithKernel {
framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
// CompareOp kernel's device type is decided by input tensor place // CompareOp kernel's device type is decided by input tensor place
bool force_cpu = ctx.Attr<bool>("force_cpu"); bool force_cpu = ctx.Attr<bool>("force_cpu");
kt.place_ = force_cpu ? platform::CPUPlace() if (force_cpu) {
: ctx.Input<framework::LoDTensor>("X")->place(); kt.place_ = platform::CPUPlace();
} else {
if (ctx.Input<framework::LoDTensor>("X")->place().type() !=
typeid(platform::CUDAPinnedPlace)) {
kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
} else {
kt.place_ = ctx.GetPlace();
}
}
return kt; return kt;
} }
}; };
......
...@@ -41,9 +41,13 @@ detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_fo ...@@ -41,9 +41,13 @@ detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_fo
detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc) detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc)
if(WITH_GPU) if(WITH_GPU)
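  # CUDA 11 and newer bundle CUB with the toolkit, so the separate cub
  # dependency is only needed for older CUDA versions (hence the check below).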
detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub) set(TMPDEPS memory)
detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS memory cub) if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc collect_fpn_proposals_op.cu DEPS memory cub) set(TMPDEPS memory cub)
endif()
detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS ${TMPDEPS})
detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS ${TMPDEPS})
detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc collect_fpn_proposals_op.cu DEPS ${TMPDEPS})
else() else()
detection_library(generate_proposals_op SRCS generate_proposals_op.cc) detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc) detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc)
......
...@@ -176,14 +176,26 @@ static void DistGradFunction(const framework::ExecutionContext& context) { ...@@ -176,14 +176,26 @@ static void DistGradFunction(const framework::ExecutionContext& context) {
} else if (p == INFINITY || p == -INFINITY) { } else if (p == INFINITY || p == -INFINITY) {
// p=inf or -inf, Lp-norm = |z_i|, the j-th element of dz tends to 0 if // p=inf or -inf, Lp-norm = |z_i|, the j-th element of dz tends to 0 if
// j!=i, or equals to sign(z_i) * dout if j=i. // j!=i, or equals to sign(z_i) * dout if j=i.
grad_t.device(place) = if (platform::is_cpu_place(context.GetPlace())) {
(x_minux_y_abs == out_t.broadcast(out_bcast_dims)).template cast<T>() * grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims))
sign * out_grad_t.broadcast(out_bcast_dims); .template cast<T>() *
sign.eval() * out_grad_t.broadcast(out_bcast_dims);
} else {
grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims))
.template cast<T>() *
sign * out_grad_t.broadcast(out_bcast_dims);
}
} else { } else {
// dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout // dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout
grad_t.device(place) = if (platform::is_cpu_place(context.GetPlace())) {
(x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * sign * grad_t.device(place) =
out_grad_t.broadcast(out_bcast_dims); (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) *
sign.eval() * out_grad_t.broadcast(out_bcast_dims);
} else {
grad_t.device(place) =
(x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * sign *
out_grad_t.broadcast(out_bcast_dims);
}
} }
Eigen::DSizes<int, Rank * 2> x_reshape_dims; Eigen::DSizes<int, Rank * 2> x_reshape_dims;
......
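For reference, the two gradient branches above implement, with z = x - y and out = the p-norm of z (broadcasting and the final reduction over the broadcast axes are handled by the surrounding code):

$$dz_j = \left(\frac{|z_j|}{out}\right)^{p-1} \mathrm{sign}(z_j)\, dout, \qquad dz_j\Big|_{p=\pm\infty} = \mathbf{1}\left[\,|z_j| = out\,\right] \mathrm{sign}(z_j)\, dout$$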
...@@ -49,8 +49,6 @@ REGISTER_OP_WITHOUT_GRADIENT(elementwise_floordiv, ops::ElementwiseOp, ...@@ -49,8 +49,6 @@ REGISTER_OP_WITHOUT_GRADIENT(elementwise_floordiv, ops::ElementwiseOp,
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
elementwise_floordiv, elementwise_floordiv,
ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, float>,
ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, double>,
ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, int>, ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, int>,
ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext,
int64_t>); int64_t>);
...@@ -19,7 +19,5 @@ namespace plat = paddle::platform; ...@@ -19,7 +19,5 @@ namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_floordiv, elementwise_floordiv,
ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, float>,
ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, double>,
ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int>, ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int>,
ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int64_t>); ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int64_t>);
...@@ -14,7 +14,6 @@ limitations under the License. */ ...@@ -14,7 +14,6 @@ limitations under the License. */
#pragma once #pragma once
#include <math.h>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
...@@ -62,15 +61,8 @@ void elementwise_floor_div(const framework::ExecutionContext &ctx, ...@@ -62,15 +61,8 @@ void elementwise_floor_div(const framework::ExecutionContext &ctx,
const framework::Tensor *x, const framework::Tensor *x,
const framework::Tensor *y, framework::Tensor *z) { const framework::Tensor *y, framework::Tensor *z) {
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
auto x_dims = x->dims(); ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>(
auto y_dims = y->dims(); ctx, x, y, axis, FloorDivFunctor<T>(), z);
if (x_dims.size() >= y_dims.size()) {
ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>(
ctx, x, y, axis, FloorDivFunctor<T>(), z);
} else {
ElementwiseComputeEx<InverseFloorDivFunctor<T>, DeviceContext, T>(
ctx, x, y, axis, InverseFloorDivFunctor<T>(), z);
}
} }
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
......
...@@ -33,22 +33,7 @@ class ElementwiseMulOp : public ElementwiseOp { ...@@ -33,22 +33,7 @@ class ElementwiseMulOp : public ElementwiseOp {
auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
using mkldnn::memory; if (platform::CanMKLDNNBeUsed(ctx)) {
auto CanMKLDNNElementwiseMulBeUsed = [&]() {
auto x_dims = ctx.Input<Tensor>("X")->dims();
auto y_dims = ctx.Input<Tensor>("Y")->dims();
int rankdiff = x_dims.size() - y_dims.size();
// TODO(jczaja): Remove this when oneDNN performance for scalar
// broadcasting
// is improved (Ernie large situation)
if (rankdiff != 0 && y_dims.size() == 1 && y_dims[0] == 1) {
return false;
}
return true;
};
if (platform::CanMKLDNNBeUsed(ctx) && CanMKLDNNElementwiseMulBeUsed()) {
return framework::OpKernelType(input_data_type, ctx.GetPlace(), return framework::OpKernelType(input_data_type, ctx.GetPlace(),
framework::DataLayout::kMKLDNN, framework::DataLayout::kMKLDNN,
framework::LibraryType::kMKLDNN); framework::LibraryType::kMKLDNN);
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/empty_op.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
class EmptyOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("ShapeTensor",
"(Tensor<int>), optional). The shape of the output."
"It has a higher priority than Attr(shape).")
.AsDispensable();
AddInput("ShapeTensorList",
"(vector<Tensor<int>>, optional). The shape of the output. "
"It has a higher priority than Attr(shape)."
"The shape of the element in vector must be [1].")
.AsDuplicable()
.AsDispensable();
AddAttr<std::vector<int64_t>>("shape",
"(vector<int64_t>) The shape of the output")
.SetDefault({});
AddAttr<int>("dtype", "The data type of output tensor, Default is float")
.SetDefault(framework::proto::VarType::FP32);
AddOutput("Out", "(Tensor) The output tensor.");
AddComment(R"DOC(empty operator
Returns a tensor filled with uninitialized data. The shape of the tensor is
defined by the variable argument shape.
The type of the tensor is specified by `dtype`.
)DOC");
}
};
class EmptyOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* context) const override {
OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "empty");
if (context->HasInput("ShapeTensor")) {
auto dims = context->GetInputDim("ShapeTensor");
int num_ele = 1;
for (int i = 0; i < dims.size(); ++i) {
num_ele *= dims[i];
}
context->SetOutputDim("Out", framework::make_ddim({num_ele}));
} else if (context->HasInputs("ShapeTensorList")) {
std::vector<int> out_dims;
auto dims_list = context->GetInputsDim("ShapeTensorList");
for (size_t i = 0; i < dims_list.size(); ++i) {
auto& dims = dims_list[i];
PADDLE_ENFORCE_EQ(
dims, framework::make_ddim({1}),
"ShapeError: The shape of Tensor in list must be [1]. "
"But received the shape "
"is [%s]",
dims);
out_dims.push_back(dims[0]);
}
context->SetOutputDim("Out", framework::make_ddim(out_dims));
} else {
auto& shape = context->Attrs().Get<std::vector<int64_t>>("shape");
context->SetOutputDim("Out", framework::make_ddim(shape));
}
}
protected:
framework::OpKernelType GetKernelTypeForVar(
const std::string& var_name, const framework::Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const override {
if (var_name == "ShapeTensor" || var_name == "ShapeTensorList") {
return expected_kernel_type;
} else {
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(), tensor.layout());
}
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& context) const override {
return framework::OpKernelType(
framework::proto::VarType::Type(context.Attr<int>("dtype")),
context.GetPlace());
}
};
class EmptyOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(framework::InferVarTypeContext* context) const override {
auto data_type = static_cast<framework::proto::VarType::Type>(
BOOST_GET_CONST(int, context->GetAttr("dtype")));
context->SetOutputDataType("Out", data_type);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OPERATOR(
empty, ops::EmptyOp, ops::EmptyOpMaker, ops::EmptyOpVarTypeInference,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(empty, ops::EmptyKernel<plat::CPUDeviceContext, bool>,
ops::EmptyKernel<plat::CPUDeviceContext, int>,
ops::EmptyKernel<plat::CPUDeviceContext, int64_t>,
ops::EmptyKernel<plat::CPUDeviceContext, float>,
ops::EmptyKernel<plat::CPUDeviceContext, double>,
ops::EmptyKernel<plat::CPUDeviceContext, plat::float16>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/empty_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
empty, ops::EmptyKernel<plat::CUDADeviceContext, bool>,
ops::EmptyKernel<plat::CUDADeviceContext, int>,
ops::EmptyKernel<plat::CUDADeviceContext, int64_t>,
ops::EmptyKernel<plat::CUDADeviceContext, float>,
ops::EmptyKernel<plat::CUDADeviceContext, double>,
ops::EmptyKernel<plat::CUDADeviceContext, plat::float16>);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/utils.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class EmptyKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto dtype = static_cast<framework::proto::VarType::Type>(
context.Attr<int>("dtype"));
Tensor *out_tensor = context.Output<Tensor>("Out");
auto shape = GetShape(context);
out_tensor->Resize(shape);
out_tensor->mutable_data(context.GetPlace(), dtype);
}
};
} // namespace operators
} // namespace paddle
...@@ -228,6 +228,26 @@ class ExpandGradOpMaker : public framework::SingleGradOpMaker<T> { ...@@ -228,6 +228,26 @@ class ExpandGradOpMaker : public framework::SingleGradOpMaker<T> {
} }
}; };
template <typename T>
class ExpandDoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetInput("X", this->OutputGrad(framework::GradVarName("X")));
op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out")));
if (this->HasInput("expand_times_tensor")) {
op->SetInput("expand_times_tensor", this->Input("expand_times_tensor"));
}
if (this->HasInput("ExpandTimes")) {
op->SetInput("ExpandTimes", this->Input("ExpandTimes"));
}
op->SetAttrMap(this->Attrs());
op->SetType("expand");
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandGradNoNeedBufVarsInferer, "X"); DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandGradNoNeedBufVarsInferer, "X");
} // namespace operators } // namespace operators
...@@ -238,6 +258,8 @@ REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker, ...@@ -238,6 +258,8 @@ REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker,
ops::ExpandGradOpMaker<paddle::framework::OpDesc>, ops::ExpandGradOpMaker<paddle::framework::OpDesc>,
ops::ExpandGradOpMaker<paddle::imperative::OpBase>); ops::ExpandGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp, REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp,
ops::ExpandDoubleGradOpMaker<paddle::framework::OpDesc>,
ops::ExpandDoubleGradOpMaker<paddle::imperative::OpBase>,
ops::ExpandGradNoNeedBufVarsInferer); ops::ExpandGradNoNeedBufVarsInferer);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>, expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>,
......
...@@ -230,6 +230,26 @@ class ExpandV2GradOpMaker : public framework::SingleGradOpMaker<T> { ...@@ -230,6 +230,26 @@ class ExpandV2GradOpMaker : public framework::SingleGradOpMaker<T> {
} }
}; };
template <typename T>
class ExpandV2DoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("expand_v2");
op->SetInput("X", this->OutputGrad(framework::GradVarName("X")));
op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out")));
if (this->HasInput("expand_shapes_tensor")) {
op->SetInput("expand_shapes_tensor", this->Input("expand_shapes_tensor"));
}
if (this->HasInput("Shape")) {
op->SetInput("Shape", this->Input("Shape"));
}
op->SetAttrMap(this->Attrs());
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandV2GradNoNeedBufVarsInferer, "X"); DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandV2GradNoNeedBufVarsInferer, "X");
} // namespace operators } // namespace operators
...@@ -240,6 +260,8 @@ REGISTER_OPERATOR(expand_v2, ops::ExpandV2Op, ops::ExpandV2OpMaker, ...@@ -240,6 +260,8 @@ REGISTER_OPERATOR(expand_v2, ops::ExpandV2Op, ops::ExpandV2OpMaker,
ops::ExpandV2GradOpMaker<paddle::framework::OpDesc>, ops::ExpandV2GradOpMaker<paddle::framework::OpDesc>,
ops::ExpandV2GradOpMaker<paddle::imperative::OpBase>); ops::ExpandV2GradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(expand_v2_grad, ops::ExpandV2GradOp, REGISTER_OPERATOR(expand_v2_grad, ops::ExpandV2GradOp,
ops::ExpandV2DoubleGradOpMaker<paddle::framework::OpDesc>,
ops::ExpandV2DoubleGradOpMaker<paddle::imperative::OpBase>,
ops::ExpandV2GradNoNeedBufVarsInferer); ops::ExpandV2GradNoNeedBufVarsInferer);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
expand_v2, ops::ExpandV2Kernel<paddle::platform::CPUDeviceContext, float>, expand_v2, ops::ExpandV2Kernel<paddle::platform::CPUDeviceContext, float>,
......
...@@ -27,27 +27,6 @@ namespace operators { ...@@ -27,27 +27,6 @@ namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
inline framework::DDim GetShape(const framework::ExecutionContext &ctx,
std::string op_type) {
// 1. shape is a Tensor
if (ctx.HasInput("ShapeTensor")) {
auto *shape_tensor = ctx.Input<framework::LoDTensor>("ShapeTensor");
auto vec_shape = GetDataFromTensor<int>(shape_tensor);
return framework::make_ddim(vec_shape);
}
// 2. shape is a list/tuple containing Tensor
auto shape_tensor_list = ctx.MultiInput<framework::Tensor>("ShapeTensorList");
if (shape_tensor_list.size() > 0) {
auto vec_shape = GetDataFromTensorList(shape_tensor_list);
return framework::make_ddim(vec_shape);
}
// 3. shape is a list/tuple without containing Tensor
auto vec_shape = ctx.Attr<std::vector<int64_t>>("shape");
return framework::make_ddim(vec_shape);
}
template <typename T> template <typename T>
class FillConstantKernel : public framework::OpKernel<T> { class FillConstantKernel : public framework::OpKernel<T> {
public: public:
...@@ -93,8 +72,7 @@ class FillConstantKernel : public framework::OpKernel<T> { ...@@ -93,8 +72,7 @@ class FillConstantKernel : public framework::OpKernel<T> {
} }
value = tensor_data[0]; value = tensor_data[0];
} }
const std::string op_type = "fill_constant"; auto shape = GetShape(ctx);
auto shape = GetShape(ctx, op_type);
if (out_var->IsType<framework::LoDTensor>()) { if (out_var->IsType<framework::LoDTensor>()) {
tensor = out_var->GetMutable<framework::LoDTensor>(); tensor = out_var->GetMutable<framework::LoDTensor>();
......
...@@ -367,8 +367,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> { ...@@ -367,8 +367,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = math::GetBlas<DeviceContext, T>(ctx);
for (int64_t i = 0; i < ids_numel; ++i) { for (int64_t i = 0; i < ids_numel; ++i) {
PADDLE_ENFORCE_LT(ids_data[i], row_number); PADDLE_ENFORCE_LT(
PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i); ids_data[i], row_number,
platform::errors::OutOfRange(
"Value of Ids %d should less than dict size %d.", i, row_number));
PADDLE_ENFORCE_GE(ids_data[i], 0,
platform::errors::OutOfRange(
"Value of Ids %d should greater than ZERO.", i));
memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width, memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width,
row_width * sizeof(T)); row_width * sizeof(T));
} }
...@@ -473,8 +478,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> { ...@@ -473,8 +478,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
for (int64_t i = 0; i < ids_numel; ++i) { for (int64_t i = 0; i < ids_numel; ++i) {
PADDLE_ENFORCE_LT(ids_data[i], row_number); PADDLE_ENFORCE_LT(
PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i); ids_data[i], row_number,
platform::errors::OutOfRange(
"Value of Ids %d should less than dict size %d.", i, row_number));
PADDLE_ENFORCE_GE(ids_data[i], 0,
platform::errors::OutOfRange(
"Value of Ids %d should greater than ZERO.", i));
memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width, memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width,
row_width * sizeof(T)); row_width * sizeof(T));
} }
......
...@@ -30,16 +30,18 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -30,16 +30,18 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fusion_gru"); OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fusion_gru");
OP_INOUT_CHECK(ctx->HasInput("WeightX"), "Input", "WeightX", "fusion_gru"); OP_INOUT_CHECK(ctx->HasInput("WeightX"), "Input", "WeightX", "fusion_gru");
OP_INOUT_CHECK(ctx->HasInput("WeightH"), "Input", "WeightH", "fusion_gru"); OP_INOUT_CHECK(ctx->HasInput("WeightH"), "Input", "WeightH", "fusion_gru");
OP_INOUT_CHECK(ctx->HasOutput("XX"), "Output", "XX", "fusion_gru"); OP_INOUT_CHECK(ctx->HasOutput("XX"), "Output", "XX", "fusion_gru");
OP_INOUT_CHECK(ctx->HasOutput("Hidden"), "Output", "Hidden", "fusion_gru"); OP_INOUT_CHECK(ctx->HasOutput("Hidden"), "Output", "Hidden", "fusion_gru");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
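  // A 3-D input with shape [T, 1, M] is flattened to the 2-D matrix [T, M]
  // before the dimension checks below.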
PADDLE_ENFORCE_EQ(x_dims.size(), 2, auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1)
platform::errors::InvalidArgument( ? framework::flatten_to_2d(x_dims, 1)
"Input(X)'s rank must be 2, but received input dim " : x_dims;
"size is:%d, input dim is:[%s]", PADDLE_ENFORCE_EQ(
x_dims.size(), x_dims)); x_mat_dims.size(), 2,
platform::errors::InvalidArgument("The size of input X dims should be 2, "
"or 3 with second dimension equal to "
"1, but now Input X dim is:[%s] ",
x_dims));
auto wx_dims = ctx->GetInputDim("WeightX"); auto wx_dims = ctx->GetInputDim("WeightX");
PADDLE_ENFORCE_EQ(wx_dims.size(), 2, PADDLE_ENFORCE_EQ(wx_dims.size(), 2,
...@@ -47,12 +49,14 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -47,12 +49,14 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
"The rank of Input(WeightX) should be 2, but received " "The rank of Input(WeightX) should be 2, but received "
"WeightX dim size is:%d, WeightX dim is:[%s] ", "WeightX dim size is:%d, WeightX dim is:[%s] ",
wx_dims.size(), wx_dims)); wx_dims.size(), wx_dims));
PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1], PADDLE_ENFORCE_EQ(
platform::errors::InvalidArgument( wx_dims[0], x_mat_dims[1],
"The first dimension of Input(WeightX) " platform::errors::InvalidArgument(
"should equal to second dimension of input x, but " "The first dimension of flattened WeightX"
"received WeightX dimension is:%d, x dimension is:%d", "should equal to last dimension of flattened input X, but "
wx_dims[0], x_dims[1])); "received fattened WeightX dimension is:%d, flattened X dimension "
"is:%d",
wx_dims[0], x_mat_dims[1]));
int frame_size = wx_dims[1] / 3; int frame_size = wx_dims[1] / 3;
auto wh_dims = ctx->GetInputDim("WeightH"); auto wh_dims = ctx->GetInputDim("WeightH");
...@@ -102,24 +106,24 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -102,24 +106,24 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
"received bias dim is:[%s], frame size is:%d", "received bias dim is:[%s], frame size is:%d",
b_dims, frame_size)); b_dims, frame_size));
} }
framework::DDim out_dims({x_dims[0], frame_size}); framework::DDim out_dims({x_mat_dims[0], frame_size});
ctx->SetOutputDim("Hidden", out_dims); ctx->SetOutputDim("Hidden", out_dims);
ctx->ShareLoD("X", "Hidden"); ctx->ShareLoD("X", "Hidden");
int xx_width; int xx_width;
if (ctx->Attrs().Get<bool>("use_seq")) { if (ctx->Attrs().Get<bool>("use_seq")) {
xx_width = wx_dims[1]; xx_width = wx_dims[1];
} else { } else {
xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; xx_width = x_mat_dims[1] > wx_dims[1] ? wx_dims[1] : x_mat_dims[1];
OP_INOUT_CHECK(ctx->HasOutput("ReorderedH0"), "Output", "ReorderedH0", OP_INOUT_CHECK(ctx->HasOutput("ReorderedH0"), "Output", "ReorderedH0",
"fusion_gru"); "fusion_gru");
OP_INOUT_CHECK(ctx->HasOutput("BatchedInput"), "Output", "BatchedInput", OP_INOUT_CHECK(ctx->HasOutput("BatchedInput"), "Output", "BatchedInput",
"fusion_gru"); "fusion_gru");
OP_INOUT_CHECK(ctx->HasOutput("BatchedOut"), "Output", "BatchedOut", OP_INOUT_CHECK(ctx->HasOutput("BatchedOut"), "Output", "BatchedOut",
"fusion_gru"); "fusion_gru");
ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]}); ctx->SetOutputDim("BatchedInput", {x_mat_dims[0], wx_dims[1]});
ctx->SetOutputDim("BatchedOut", out_dims); ctx->SetOutputDim("BatchedOut", out_dims);
} }
ctx->SetOutputDim("XX", {x_dims[0], xx_width}); ctx->SetOutputDim("XX", {x_mat_dims[0], xx_width});
ctx->ShareLoD("X", "XX"); ctx->ShareLoD("X", "XX");
} }
...@@ -202,6 +206,27 @@ void FusionGRUOpMaker::Make() { ...@@ -202,6 +206,27 @@ void FusionGRUOpMaker::Make() {
AddAttr<bool>("use_mkldnn", AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel") "(bool, default false) Only used in mkldnn kernel")
.SetDefault(false); .SetDefault(false);
AddAttr<std::string>(
"mkldnn_data_type",
"(string, default \"float32\"). Data type of mkldnn kernel")
.SetDefault("float32")
.InEnum({"float32", "int8", "bfloat16"});
AddAttr<float>("Scale_data",
"Scale to be used for int8 input/output data."
"Only used with MKL-DNN INT8.")
.SetDefault(1.0f);
AddAttr<float>("Shift_data",
"Shift to be used for int8 input/output data."
"Only used with MKL-DNN INT8.")
.SetDefault(0.0f);
AddAttr<std::vector<float>>("Scale_weights",
"Scale_weights to be used for int8 weights data."
"Only used with MKL-DNN INT8.")
.SetDefault({1.0f});
AddAttr<bool>("force_fp32_output",
"(bool, default false) Force INT8 kernel output FP32, only "
"used in MKL-DNN INT8")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
The Fusion complete GRU Operator. The Fusion complete GRU Operator.
This operator fuses the fully-connected operator into GRU, This operator fuses the fully-connected operator into GRU,
...@@ -220,14 +245,17 @@ class FusionGRUKernel : public framework::OpKernel<T> { ...@@ -220,14 +245,17 @@ class FusionGRUKernel : public framework::OpKernel<T> {
} }
} }
#define INIT_BASE_DEFINES \ #define INIT_BASE_DEFINES \
auto* x = ctx.Input<LoDTensor>("X"); \ auto* x = ctx.Input<LoDTensor>("X"); \
auto* wh = ctx.Input<Tensor>("WeightH"); \ auto* wh = ctx.Input<Tensor>("WeightH"); \
auto* xx = ctx.Output<LoDTensor>("XX"); \ auto* xx = ctx.Output<LoDTensor>("XX"); \
auto x_lod = x->lod(); \ auto x_lod = x->lod(); \
auto x_dims = x->dims(); /* T x M*/ \ auto x_dims = x->dims(); /* T x M*/ \
auto wh_dims = wh->dims(); /* D x 3D*/ \ auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1) \
const int total_T = x_dims[0]; \ ? framework::flatten_to_2d(x_dims, 1) \
: x_dims; \
auto wh_dims = wh->dims(); /* D x 3D*/ \
const int total_T = x_mat_dims[0]; \
const int D3 = wh_dims[1] const int D3 = wh_dims[1]
#define INIT_OTHER_DEFINES \ #define INIT_OTHER_DEFINES \
...@@ -236,7 +264,7 @@ class FusionGRUKernel : public framework::OpKernel<T> { ...@@ -236,7 +264,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
auto* bias = ctx.Input<Tensor>("Bias"); \ auto* bias = ctx.Input<Tensor>("Bias"); \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \ auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
bool is_reverse = ctx.Attr<bool>("is_reverse"); \ bool is_reverse = ctx.Attr<bool>("is_reverse"); \
const int M = x_dims[1]; \ const int M = x_mat_dims[1]; \
const int D = wh_dims[0]; \ const int D = wh_dims[0]; \
const int D2 = D * 2; \ const int D2 = D * 2; \
const jit::gru_attr_t attr( \ const jit::gru_attr_t attr( \
......
...@@ -21,11 +21,12 @@ namespace operators { ...@@ -21,11 +21,12 @@ namespace operators {
using paddle::framework::LoDTensor; using paddle::framework::LoDTensor;
using paddle::framework::Tensor; using paddle::framework::Tensor;
using paddle::platform::CPUDeviceContext; using paddle::platform::CPUDeviceContext;
using paddle::platform::CreateKey;
using paddle::platform::MKLDNNGetDataType; using paddle::platform::MKLDNNGetDataType;
using paddle::platform::MKLDNNMemDesc; using paddle::platform::MKLDNNMemDesc;
using platform::to_void_cast; using platform::to_void_cast;
template <typename T> template <typename T, typename T_out = T>
class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
public: public:
GRUMKLDNNHandler(const paddle::framework::ExecutionContext& ctx, GRUMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
...@@ -38,7 +39,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -38,7 +39,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
const std::string& unique_name) const std::string& unique_name)
: platform::MKLDNNHandlerT<T, dnnl::gru_forward>( : platform::MKLDNNHandlerT<T, dnnl::gru_forward>(
dev_ctx, dev_ctx.GetEngine(), cpu_place, dev_ctx, dev_ctx.GetEngine(), cpu_place,
platform::CreateKey(unique_name, Ti)), CreateKey(unique_name, MKLDNNGetDataType<T>(), Ti)),
N(N), N(N),
Ti(Ti), Ti(Ti),
IC(IC), IC(IC),
...@@ -47,9 +48,29 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -47,9 +48,29 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
// do not depend on Ti size but primitive and input/output memory do // do not depend on Ti size but primitive and input/output memory do
if (platform::MKLDNNDeviceContext::tls().get_cur_mkldnn_session_id() != if (platform::MKLDNNDeviceContext::tls().get_cur_mkldnn_session_id() !=
platform::MKLDNNDeviceContextThreadLocals::kMKLDNNSessionID_Default) { platform::MKLDNNDeviceContextThreadLocals::kMKLDNNSessionID_Default) {
memory_key_ = unique_name; memory_key_ = CreateKey(unique_name, MKLDNNGetDataType<T>());
} else { } else {
memory_key_ = unique_name + "-t:" + platform::ThreadIDasStr(); memory_key_ = CreateKey(unique_name, MKLDNNGetDataType<T>(), "-t:",
platform::ThreadIDasStr());
}
// Is it int8 kernel
const bool is_INT8 = std::is_same<T, uint8_t>::value;
if (is_INT8) {
// Int8 attributes
const float scale_data = ctx.Attr<float>("Scale_data");
const float shift_data = ctx.Attr<float>("Shift_data");
const auto scale_weights = ctx.Attr<std::vector<float>>("Scale_weights");
const int weights_scale_mask =
0 +
(1 << 3) // bit, indicating the unique scales for `g` dim in `ldigo`
+
(1 << 4); // bit, indicating the unique scales for `o` dim in `ldigo`
attr_.set_rnn_data_qparams(scale_data, shift_data);
attr_.set_rnn_weights_qparams(weights_scale_mask, scale_weights);
} }
if (!this->isCached()) { if (!this->isCached()) {
...@@ -63,6 +84,10 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -63,6 +84,10 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
platform::errors::Unimplemented( platform::errors::Unimplemented(
"oneDNN fusion_gru supports only tanh as an activation.")); "oneDNN fusion_gru supports only tanh as an activation."));
// Weights for int8 kernel are of a type s8
const auto weights_dt =
is_INT8 ? dnnl::memory::data_type::s8 : dnnl::memory::data_type::f32;
// oneDNN RNN dimensions // oneDNN RNN dimensions
const int64_t D = 1; // Directions const int64_t D = 1; // Directions
const int64_t L = 1; // Layers (PP supports only 1 stacked layer) const int64_t L = 1; // Layers (PP supports only 1 stacked layer)
...@@ -71,19 +96,16 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -71,19 +96,16 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
// Create memory descriptors // Create memory descriptors
auto input_md = MKLDNNMemDesc({Ti, N, IC}, MKLDNNGetDataType<T>(), auto input_md = MKLDNNMemDesc({Ti, N, IC}, MKLDNNGetDataType<T>(),
MKLDNNMemoryFormat::any); MKLDNNMemoryFormat::any);
auto weight_x_md = MKLDNNMemDesc( auto weight_x_md =
{L, D, IC, G, OC}, MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::any); MKLDNNMemDesc({L, D, IC, G, OC}, weights_dt, MKLDNNMemoryFormat::any);
auto weight_h_md = MKLDNNMemDesc( auto weight_h_md =
{L, D, OC, G, OC}, MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::any); MKLDNNMemDesc({L, D, OC, G, OC}, weights_dt, MKLDNNMemoryFormat::any);
auto bias_md = MKLDNNMemDesc({L, D, G, OC}, MKLDNNGetDataType<float>(), auto bias_md = MKLDNNMemDesc({L, D, G, OC}, MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldgo); MKLDNNMemoryFormat::ldgo);
auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType<T>(), auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType<T_out>(),
MKLDNNMemoryFormat::any); MKLDNNMemoryFormat::any);
auto h0_md = dnnl::memory::desc(); auto h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<T>(),
if (h0) { MKLDNNMemoryFormat::ldnc);
h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<T>(),
MKLDNNMemoryFormat::ldnc);
}
// Create GRU oneDNN primitive // Create GRU oneDNN primitive
const auto direction = const auto direction =
...@@ -91,7 +113,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -91,7 +113,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
: dnnl::rnn_direction::unidirectional_left2right; : dnnl::rnn_direction::unidirectional_left2right;
this->AcquireForwardPrimitiveDescriptor( this->AcquireForwardPrimitiveDescriptor(
dnnl::prop_kind::forward_inference, direction, input_md, h0_md, attr_, dnnl::prop_kind::forward_inference, direction, input_md, h0_md,
weight_x_md, weight_h_md, bias_md, hidden_md, dnnl::memory::desc()); weight_x_md, weight_h_md, bias_md, hidden_md, dnnl::memory::desc());
} }
} }
...@@ -101,29 +123,31 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -101,29 +123,31 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
dnnl::memory::format_tag::ntc); dnnl::memory::format_tag::ntc);
} }
void reorderRNNdata(const T* input_data, T* output_data, void reorderRNNdata(void* input_data, void* output_data,
std::vector<size_t> lod, const bool is_reverse, std::vector<size_t> lod, const bool is_reverse,
platform::RNNReorderType reorder_type) { platform::RNNReorderType reorder_type) {
switch (reorder_type) { switch (reorder_type) {
// Reorder input memory [WORDS, C] + LoD -> [N, T, C] // Reorder input memory [WORDS, C] + LoD -> [N, T, C]
case platform::RNNReorderType::PP_NTC: { case platform::RNNReorderType::PP_NTC: {
auto* input_data_iter = input_data; auto* input_data_iter = reinterpret_cast<T*>(input_data);
auto* output_data_iter = reinterpret_cast<T*>(output_data);
for (int n = 0; n < N; ++n) { for (int n = 0; n < N; ++n) {
const auto num_elements = (lod[n + 1] - lod[n]) * IC; const auto num_elements = (lod[n + 1] - lod[n]) * IC;
const auto offset = is_reverse ? (Ti * IC - num_elements) : 0; const auto offset = is_reverse ? (Ti * IC - num_elements) : 0;
memcpy(output_data + n * Ti * IC + offset, input_data_iter, memcpy(output_data_iter + n * Ti * IC + offset, input_data_iter,
sizeof(T) * num_elements); sizeof(T) * num_elements);
input_data_iter += num_elements; input_data_iter += num_elements;
} }
} break; } break;
// Reorder input memory [WORDS, C] + LoD -> [T, N, C] // Reorder input memory [WORDS, C] + LoD -> [T, N, C]
case platform::RNNReorderType::PP_TNC: { case platform::RNNReorderType::PP_TNC: {
auto* input_data_iter = input_data; auto* input_data_iter = reinterpret_cast<T*>(input_data);
auto* output_data_iter = reinterpret_cast<T*>(output_data);
for (int n = 0; n < N; ++n) { for (int n = 0; n < N; ++n) {
const auto num_elements = (lod[n + 1] - lod[n]); const auto num_elements = (lod[n + 1] - lod[n]);
const auto offset = is_reverse ? (Ti - num_elements) : 0; const auto offset = is_reverse ? (Ti - num_elements) : 0;
for (size_t t = 0; t < num_elements; ++t) { for (size_t t = 0; t < num_elements; ++t) {
memcpy(output_data + (t + offset) * N * IC + n * IC, memcpy(output_data_iter + (t + offset) * N * IC + n * IC,
input_data_iter, sizeof(T) * IC); input_data_iter, sizeof(T) * IC);
input_data_iter += IC; input_data_iter += IC;
} }
...@@ -131,24 +155,27 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -131,24 +155,27 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
} break; } break;
// Reorder output values to PP format [N, T, C] -> [WORDS, C] // Reorder output values to PP format [N, T, C] -> [WORDS, C]
case platform::RNNReorderType::NTC_PP: { case platform::RNNReorderType::NTC_PP: {
auto* output_data_iter = output_data; auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
for (int n = 0; n < N; ++n) { for (int n = 0; n < N; ++n) {
const auto num_elements = (lod[n + 1] - lod[n]) * OC; const auto num_elements = (lod[n + 1] - lod[n]) * OC;
const auto offset = is_reverse ? (Ti * OC - num_elements) : 0; const auto offset = is_reverse ? (Ti * OC - num_elements) : 0;
memcpy(output_data_iter, input_data + n * Ti * OC + offset, memcpy(output_data_iter, input_data_iter + n * Ti * OC + offset,
sizeof(T) * num_elements); sizeof(T_out) * num_elements);
output_data_iter += num_elements; output_data_iter += num_elements;
} }
} break; } break;
// Reorder output values to PP format [T, N, C] -> [WORDS, C] // Reorder output values to PP format [T, N, C] -> [WORDS, C]
case platform::RNNReorderType::TNC_PP: { case platform::RNNReorderType::TNC_PP: {
auto* output_data_iter = output_data; auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
for (int n = 0; n < N; ++n) { for (int n = 0; n < N; ++n) {
const auto num_elements = lod[n + 1] - lod[n]; const auto num_elements = lod[n + 1] - lod[n];
const auto offset = is_reverse ? (Ti - num_elements) : 0; const auto offset = is_reverse ? (Ti - num_elements) : 0;
for (size_t t = 0; t < num_elements; ++t) { for (size_t t = 0; t < num_elements; ++t) {
memcpy(output_data_iter, memcpy(output_data_iter,
input_data + (t + offset) * N * OC + n * OC, sizeof(T) * OC); input_data_iter + (t + offset) * N * OC + n * OC,
sizeof(T_out) * OC);
output_data_iter += OC; output_data_iter += OC;
} }
} }
...@@ -169,9 +196,9 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -169,9 +196,9 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
} }
const auto& input_lod = input->lod()[0]; const auto& input_lod = input->lod()[0];
auto* x_data = input->data<T>(); auto* x_data = to_void_cast(input->data<T>());
auto* x_onednn_data = reinterpret_cast<T*>(memory_p->get_data_handle()); auto* x_onednn_data = memory_p->get_data_handle();
memset(x_onednn_data, 0, sizeof(T) * N * Ti * IC); memset(x_onednn_data, 0, sizeof(T) * N * Ti * IC);
if (platform::GetMKLDNNFormat(this->fwd_pd_->src_desc()) == if (platform::GetMKLDNNFormat(this->fwd_pd_->src_desc()) ==
...@@ -198,19 +225,35 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -198,19 +225,35 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
return memory_p; return memory_p;
} }
// TODO(grygielski) H0 is for now persistable
std::shared_ptr<dnnl::memory> AcquireH0Memory(const Tensor* h0) { std::shared_ptr<dnnl::memory> AcquireH0Memory(const Tensor* h0) {
const std::string h0_key = memory_key_ + "@h0"; const std::string h0_key = memory_key_ + "@h0";
auto memory_p = auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(h0_key)); std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(h0_key));
auto* h0_data = to_void_cast(h0->data<T>());
if (!memory_p) { if (!memory_p) {
memory_p = std::make_shared<dnnl::memory>( auto user_h0_memory = dnnl::memory();
this->fwd_pd_->weights_layer_desc(), this->engine_, h0_data); if (h0) {
user_h0_memory =
dnnl::memory({{1, 1, N, OC},
MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldnc},
this->engine_, to_void_cast(h0->data<float>()));
} else {
user_h0_memory = dnnl::memory({{1, 1, N, OC},
MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldnc},
this->engine_);
memset(user_h0_memory.get_data_handle(), 0, sizeof(float) * N * OC);
}
memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_iter_desc(),
this->engine_);
dnnl::stream astream(this->engine_);
dnnl::reorder(user_h0_memory, *memory_p, attr_)
.execute(astream, user_h0_memory, *memory_p);
this->dev_ctx_.SetBlob(h0_key, memory_p); this->dev_ctx_.SetBlob(h0_key, memory_p);
} else {
memory_p->set_data_handle(h0_data);
} }
return memory_p; return memory_p;
} }
...@@ -245,7 +288,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -245,7 +288,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
this->fwd_pd_->weights_layer_desc(), this->engine_); this->fwd_pd_->weights_layer_desc(), this->engine_);
dnnl::stream astream(this->engine_); dnnl::stream astream(this->engine_);
dnnl::reorder(user_memory, *memory_p) dnnl::reorder(user_memory, *memory_p, attr_)
.execute(astream, user_memory, *memory_p); .execute(astream, user_memory, *memory_p);
this->dev_ctx_.SetBlob(wx_key, memory_p); this->dev_ctx_.SetBlob(wx_key, memory_p);
...@@ -298,7 +341,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -298,7 +341,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
this->fwd_pd_->weights_iter_desc(), this->engine_); this->fwd_pd_->weights_iter_desc(), this->engine_);
dnnl::stream astream(this->engine_); dnnl::stream astream(this->engine_);
dnnl::reorder(user_memory, *memory_p) dnnl::reorder(user_memory, *memory_p, attr_)
.execute(astream, user_memory, *memory_p); .execute(astream, user_memory, *memory_p);
this->dev_ctx_.SetBlob(wh_key, memory_p); this->dev_ctx_.SetBlob(wh_key, memory_p);
...@@ -347,12 +390,26 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -347,12 +390,26 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
// Memory size of weights, bias and h0 does not depend // Memory size of weights, bias and h0 does not depend
// on Ti size, thus we need another key to cache them // on Ti size, thus we need another key to cache them
std::string memory_key_; std::string memory_key_;
dnnl::primitive_attr attr_;
}; };
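The new AcquireH0Memory path above builds a user-side f32 memory in ldnc layout (zero-filled when no H0 tensor is provided) and reorders it into whatever layout the primitive descriptor chose for src_iter. A minimal standalone sketch of that reorder pattern, assuming oneDNN 1.x (dnnl.hpp) on a CPU engine; the {L, D, N, OC} sizes and the plain ldnc destination layout are illustrative stand-ins for fwd_pd_->src_iter_desc():

#include <vector>
#include "dnnl.hpp"

int main() {
  dnnl::engine eng(dnnl::engine::kind::cpu, 0);
  dnnl::stream strm(eng);

  // {L, D, N, OC} = {1, 1, 2, 4}; zero initial hidden state, as in the
  // "no H0 given" branch of AcquireH0Memory.
  dnnl::memory::dims dims = {1, 1, 2, 4};
  std::vector<float> h0(2 * 4, 0.0f);

  auto user_md = dnnl::memory::desc(dims, dnnl::memory::data_type::f32,
                                    dnnl::memory::format_tag::ldnc);
  dnnl::memory user_mem(user_md, eng, h0.data());

  // In the kernel the destination layout comes from fwd_pd_->src_iter_desc();
  // a plain ldnc layout is used here just to show the reorder mechanics.
  dnnl::memory prim_mem(user_md, eng);

  dnnl::reorder(user_mem, prim_mem).execute(strm, user_mem, prim_mem);
  strm.wait();
  return 0;
}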
template <typename T> template <typename T>
class FusionGRUMKLDNNKernel : public framework::OpKernel<T> { class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const bool is_INT8 = std::is_same<T, uint8_t>::value;
const bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
// TODO(grygielski) Add option for bfloat
if (!is_INT8 || force_fp32_output) {
RunKernel<float>(ctx);
} else {
RunKernel<uint8_t>(ctx);
}
}
template <typename Tout = T>
void RunKernel(const framework::ExecutionContext& ctx) const {
auto& dev_ctx = auto& dev_ctx =
ctx.template device_context<platform::MKLDNNDeviceContext>(); ctx.template device_context<platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine(); const auto& mkldnn_engine = dev_ctx.GetEngine();
...@@ -364,13 +421,16 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> { ...@@ -364,13 +421,16 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
const auto* weight_h = ctx.Input<Tensor>("WeightH"); const auto* weight_h = ctx.Input<Tensor>("WeightH");
const auto* bias = ctx.Input<Tensor>("Bias"); const auto* bias = ctx.Input<Tensor>("Bias");
auto* hidden = ctx.Output<LoDTensor>("Hidden"); auto* hidden = ctx.Output<LoDTensor>("Hidden");
auto x_dims = input->dims();
auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1)
? framework::flatten_to_2d(x_dims, 1)
: x_dims;
// Get attributes // Get attributes
const bool is_reverse = ctx.Attr<bool>("is_reverse"); const bool is_reverse = ctx.Attr<bool>("is_reverse");
const bool origin_mode = ctx.Attr<bool>("origin_mode"); const bool origin_mode = ctx.Attr<bool>("origin_mode");
// Get tensor dimensions // Get tensor dimensions
const auto x_dims = framework::vectorize(input->dims()); const auto x_mat_dims_vec = framework::vectorize(x_mat_dims);
const auto weight_h_dims = framework::vectorize(weight_h->dims()); const auto weight_h_dims = framework::vectorize(weight_h->dims());
const auto& input_lod = input->lod()[0]; const auto& input_lod = input->lod()[0];
...@@ -384,15 +444,17 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> { ...@@ -384,15 +444,17 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
} }
return res; return res;
}(); }();
const int64_t IC = x_dims[1]; // Input channels const int64_t IC = x_mat_dims_vec[1]; // Input channels
const int64_t OC = weight_h_dims[0]; // Output channels const int64_t OC = weight_h_dims[0]; // Output channels
GRUMKLDNNHandler<T> handler(ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), GRUMKLDNNHandler<T, Tout> handler(
input, weight_h, h0, is_reverse, N, Ti, IC, OC, ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, weight_h, h0,
ctx.InputName("X") + ctx.InputName("WeightH")); is_reverse, N, Ti, IC, OC,
ctx.InputName("X") + ctx.InputName("WeightH"));
auto input_memory_p = auto input_memory_p =
handler.AcquireInputMemoryWithReorder(input, is_reverse); handler.AcquireInputMemoryWithReorder(input, is_reverse);
auto h0_memory_p = handler.AcquireH0Memory(h0);
auto weight_x_memory_p = auto weight_x_memory_p =
handler.AcquireWeightXMemory(weight_x, origin_mode); handler.AcquireWeightXMemory(weight_x, origin_mode);
auto weight_h_memory_p = auto weight_h_memory_p =
...@@ -402,25 +464,21 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> { ...@@ -402,25 +464,21 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
std::unordered_map<int, dnnl::memory> gru_args = { std::unordered_map<int, dnnl::memory> gru_args = {
{DNNL_ARG_SRC_LAYER, *input_memory_p}, {DNNL_ARG_SRC_LAYER, *input_memory_p},
{DNNL_ARG_SRC_ITER, *h0_memory_p},
{DNNL_ARG_WEIGHTS_LAYER, *weight_x_memory_p}, {DNNL_ARG_WEIGHTS_LAYER, *weight_x_memory_p},
{DNNL_ARG_WEIGHTS_ITER, *weight_h_memory_p}, {DNNL_ARG_WEIGHTS_ITER, *weight_h_memory_p},
{DNNL_ARG_BIAS, *bias_memory_p}, {DNNL_ARG_BIAS, *bias_memory_p},
{DNNL_ARG_DST_LAYER, *hidden_onednn_memory_p}}; {DNNL_ARG_DST_LAYER, *hidden_onednn_memory_p}};
if (h0) {
auto h0_memory_p = handler.AcquireH0Memory(h0);
gru_args.insert({DNNL_ARG_SRC_ITER, *h0_memory_p});
}
auto gru_forward_p = handler.AcquireForwardPrimitive(); auto gru_forward_p = handler.AcquireForwardPrimitive();
dnnl::stream astream(mkldnn_engine); dnnl::stream astream(mkldnn_engine);
gru_forward_p->execute(astream, gru_args); gru_forward_p->execute(astream, gru_args);
astream.wait(); astream.wait();
auto* hidden_onednn_data = auto* hidden_onednn_data = hidden_onednn_memory_p->get_data_handle();
reinterpret_cast<T*>(hidden_onednn_memory_p->get_data_handle()); auto* hidden_data =
auto* hidden_data = hidden->mutable_data<T>(ctx.GetPlace()); to_void_cast(hidden->mutable_data<Tout>(ctx.GetPlace()));
if (handler.is_NTC()) { if (handler.is_NTC()) {
handler.reorderRNNdata(hidden_onednn_data, hidden_data, input_lod, handler.reorderRNNdata(hidden_onednn_data, hidden_data, input_lod,
is_reverse, platform::RNNReorderType::NTC_PP); is_reverse, platform::RNNReorderType::NTC_PP);
...@@ -436,4 +494,5 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> { ...@@ -436,4 +494,5 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_KERNEL(fusion_gru, MKLDNN, paddle::platform::CPUPlace, REGISTER_OP_KERNEL(fusion_gru, MKLDNN, paddle::platform::CPUPlace,
ops::FusionGRUMKLDNNKernel<float>); ops::FusionGRUMKLDNNKernel<float>,
ops::FusionGRUMKLDNNKernel<uint8_t>);
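The registration above pairs the float kernel with a uint8_t one, and Compute picks the output element type at run time from the input type and the force_fp32_output attribute. A standalone mirror of just that dispatch, with FakeRun as a hypothetical stand-in for RunKernel<Tout>:

#include <cstdint>
#include <iostream>
#include <type_traits>

// Hypothetical stand-in for RunKernel<Tout>: only reports the chosen type.
template <typename Tout>
void FakeRun() {
  std::cout << "running with output element size " << sizeof(Tout) << "\n";
}

// Mirrors FusionGRUMKLDNNKernel::Compute: uint8_t inputs keep uint8_t
// outputs unless force_fp32_output asks for float.
template <typename T>
void Compute(bool force_fp32_output) {
  const bool is_INT8 = std::is_same<T, uint8_t>::value;
  if (!is_INT8 || force_fp32_output) {
    FakeRun<float>();
  } else {
    FakeRun<uint8_t>();
  }
}

int main() {
  Compute<float>(false);    // fp32 path
  Compute<uint8_t>(false);  // int8 path
  Compute<uint8_t>(true);   // int8 input forced to fp32 output
  return 0;
}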
...@@ -34,8 +34,7 @@ class CPUGaussianRandomKernel : public framework::OpKernel<T> { ...@@ -34,8 +34,7 @@ class CPUGaussianRandomKernel : public framework::OpKernel<T> {
auto* tensor = context.Output<framework::Tensor>("Out"); auto* tensor = context.Output<framework::Tensor>("Out");
std::normal_distribution<T> dist(mean, std); std::normal_distribution<T> dist(mean, std);
const std::string op_type = "gaussian_random"; auto shape = GetShape(context);
auto shape = GetShape(context, op_type);
tensor->Resize(shape); tensor->Resize(shape);
int64_t size = tensor->numel(); int64_t size = tensor->numel();
T* data = tensor->mutable_data<T>(context.GetPlace()); T* data = tensor->mutable_data<T>(context.GetPlace());
......
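For reference, the CPU gaussian_random kernel above resizes the output to the shape returned by GetShape and fills it from std::normal_distribution. A minimal sketch of that fill with a plain buffer standing in for framework::Tensor, assuming the usual convention that a zero seed means drawing one from std::random_device:

#include <cstdint>
#include <random>
#include <vector>

int main() {
  const float mean = 0.0f, std_dev = 1.0f;
  const unsigned int seed = 0;                // 0: draw a seed from random_device
  const std::vector<int64_t> shape = {2, 3};  // illustrative shape

  int64_t size = 1;
  for (auto d : shape) size *= d;
  std::vector<float> data(size);

  std::minstd_rand engine(seed == 0 ? std::random_device{}() : seed);
  std::normal_distribution<float> dist(mean, std_dev);
  for (int64_t i = 0; i < size; ++i) data[i] = dist(engine);
  return 0;
}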
...@@ -58,8 +58,7 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> { ...@@ -58,8 +58,7 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
T mean = static_cast<T>(context.Attr<float>("mean")); T mean = static_cast<T>(context.Attr<float>("mean"));
T std = static_cast<T>(context.Attr<float>("std")); T std = static_cast<T>(context.Attr<float>("std"));
thrust::counting_iterator<unsigned int> index_sequence_begin(0); thrust::counting_iterator<unsigned int> index_sequence_begin(0);
const std::string op_type = "gaussian_random"; auto shape = GetShape(context);
auto shape = GetShape(context, op_type);
tensor->Resize(shape); tensor->Resize(shape);
T* data = tensor->mutable_data<T>(context.GetPlace()); T* data = tensor->mutable_data<T>(context.GetPlace());
......
...@@ -39,7 +39,7 @@ class LiteEngineOp : public framework::OperatorBase { ...@@ -39,7 +39,7 @@ class LiteEngineOp : public framework::OperatorBase {
private: private:
std::vector<std::string> in_names_; std::vector<std::string> in_names_;
std::vector<std::string> out_names_; std::vector<std::string> out_names_;
paddle::lite::Predictor *engine_; paddle::lite_api::PaddlePredictor *engine_;
framework::proto::VarType::Type precision_; framework::proto::VarType::Type precision_;
bool use_gpu_; bool use_gpu_;
bool zero_copy_; bool zero_copy_;
...@@ -78,10 +78,10 @@ class LiteEngineOp : public framework::OperatorBase { ...@@ -78,10 +78,10 @@ class LiteEngineOp : public framework::OperatorBase {
framework::LoDTensor src_t = framework::LoDTensor src_t =
inference::analysis::GetFromScope<framework::LoDTensor>(scope, inference::analysis::GetFromScope<framework::LoDTensor>(scope,
in_names_[i]); in_names_[i]);
paddle::lite::Tensor *dst_t = engine_->GetInput(i); paddle::lite_api::Tensor dst_t = *(engine_->GetInput(i));
VLOG(3) << "== fluid -> lite (" << in_names_[i] << " -> " VLOG(3) << "== fluid -> lite (" << in_names_[i] << " -> "
<< engine_->GetInputNames()[i] << ")"; << engine_->GetInputNames()[i] << ")";
inference::lite::utils::TensorCopy(dst_t, &src_t, *ctx, zero_copy_); inference::lite::utils::TensorCopy(&dst_t, &src_t, *ctx, zero_copy_);
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(dev_place)) { if (platform::is_gpu_place(dev_place)) {
...@@ -93,7 +93,7 @@ class LiteEngineOp : public framework::OperatorBase { ...@@ -93,7 +93,7 @@ class LiteEngineOp : public framework::OperatorBase {
engine_->Run(); engine_->Run();
VLOG(3) << "lite engine run done"; VLOG(3) << "lite engine run done";
for (size_t i = 0; i < out_names_.size(); i++) { for (size_t i = 0; i < out_names_.size(); i++) {
paddle::lite::Tensor src_t = *(engine_->GetOutput(i)); paddle::lite_api::Tensor src_t = *(engine_->GetOutput(i));
framework::LoDTensor *dst_t = framework::LoDTensor *dst_t =
&inference::analysis::GetFromScope<framework::LoDTensor>( &inference::analysis::GetFromScope<framework::LoDTensor>(
scope, out_names_[i]); scope, out_names_[i]);
......
...@@ -84,10 +84,10 @@ TEST(LiteEngineOp, engine_op) { ...@@ -84,10 +84,10 @@ TEST(LiteEngineOp, engine_op) {
inference::lite::EngineConfig config; inference::lite::EngineConfig config;
config.valid_places = { config.valid_places = {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
paddle::lite::Place({TARGET(kCUDA), PRECISION(kFloat)}), paddle::lite_api::Place({TARGET(kCUDA), PRECISION(kFloat)}),
#endif #endif
paddle::lite::Place({TARGET(kHost), PRECISION(kAny)}), paddle::lite_api::Place({TARGET(kX86), PRECISION(kFloat)}),
paddle::lite::Place({TARGET(kX86), PRECISION(kFloat)}), paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}),
}; };
serialize_params(&(config.param), &scope, repetitive_params); serialize_params(&(config.param), &scope, repetitive_params);
config.model = program.Proto()->SerializeAsString(); config.model = program.Proto()->SerializeAsString();
......
...@@ -9,7 +9,11 @@ function(math_library TARGET) ...@@ -9,7 +9,11 @@ function(math_library TARGET)
set(hip_srcs) set(hip_srcs)
set(math_common_deps device_context framework_proto enforce) set(math_common_deps device_context framework_proto enforce)
if (WITH_GPU) if (WITH_GPU)
list(APPEND math_common_deps cub) if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
list(APPEND math_common_deps cub)
else()
list(APPEND math_common_deps)
endif()
endif() endif()
set(multiValueArgs DEPS) set(multiValueArgs DEPS)
cmake_parse_arguments(math_library "${options}" "${oneValueArgs}" cmake_parse_arguments(math_library "${options}" "${oneValueArgs}"
......
...@@ -128,9 +128,23 @@ struct RowwiseAdd<platform::CPUDeviceContext, T> { ...@@ -128,9 +128,23 @@ struct RowwiseAdd<platform::CPUDeviceContext, T> {
const framework::Tensor& input, const framework::Tensor& input,
const framework::Tensor& vector, framework::Tensor* output) { const framework::Tensor& vector, framework::Tensor* output) {
auto in_dims = input.dims(); auto in_dims = input.dims();
auto out_dims = output->dims();
auto size = input.numel() / in_dims[0]; auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector.numel(), size); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE_EQ(output->dims(), in_dims); vector.numel(), size,
platform::errors::InvalidArgument(
"The input vector size"
" should be equal to the size of each row of input tensor."
" Expected vector size=%d, but received %d",
size, vector.numel()));
const std::string in_dims_str = in_dims.to_str();
const std::string out_dims_str = out_dims.to_str();
PADDLE_ENFORCE_EQ(out_dims, in_dims,
platform::errors::InvalidArgument(
"The output tensor shape should be the same as the input"
" tensor shape. Expected output tensor shape: %s,"
" but received %s",
in_dims_str.c_str(), out_dims_str.c_str()));
auto in = framework::EigenMatrix<T>::From(input); auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(vector); auto vec = framework::EigenVector<T>::Flatten(vector);
......
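Most of the remaining hunks follow the same pattern as the RowwiseAdd change above: a bare PADDLE_ENFORCE_EQ(a, b) gains a third argument built from platform::errors, so a failed check reports a typed error carrying the expected and received values. A standalone analogue of that pattern using plain exceptions (enforce_eq is a hypothetical helper, not Paddle's macro):

#include <sstream>
#include <stdexcept>
#include <string>

// Hypothetical analogue of PADDLE_ENFORCE_EQ(a, b, InvalidArgument(...)):
// compare, and on mismatch throw a typed error with a readable message.
template <typename T>
void enforce_eq(const T& a, const T& b, const std::string& what) {
  if (!(a == b)) {
    std::ostringstream msg;
    msg << what << " Expected " << a << ", but received " << b << ".";
    throw std::invalid_argument(msg.str());
  }
}

int main() {
  int64_t vector_numel = 8, row_size = 8;
  enforce_eq(vector_numel, row_size,
             "The input vector size should equal the row size of the input tensor.");
  return 0;
}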
...@@ -88,9 +88,24 @@ struct RowwiseAdd<platform::CUDADeviceContext, T> { ...@@ -88,9 +88,24 @@ struct RowwiseAdd<platform::CUDADeviceContext, T> {
const framework::Tensor& input, const framework::Tensor& input,
const framework::Tensor& vector, framework::Tensor* output) { const framework::Tensor& vector, framework::Tensor* output) {
auto in_dims = input.dims(); auto in_dims = input.dims();
auto out_dims = output->dims();
auto size = input.numel() / in_dims[0]; auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector.numel(), size); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE_EQ(output->dims(), in_dims); vector.numel(), size,
platform::errors::InvalidArgument(
"The input vector size"
" should be equal to the size of each row of input tensor."
" Expected vector size=%d, but received %d",
size, vector.numel()));
const std::string in_dims_str = in_dims.to_str();
const std::string out_dims_str = out_dims.to_str();
PADDLE_ENFORCE_EQ(
out_dims, in_dims,
platform::errors::InvalidArgument(
"The output tensor shape should be the same as the input tensor"
" shape. Expected output tensor shape: %s,"
" but received %s",
in_dims_str.c_str(), out_dims_str.c_str()));
int blocks = 512; int blocks = 512;
int grids = (input.numel() + blocks - 1) / blocks; int grids = (input.numel() + blocks - 1) / blocks;
RowwiseAddKernel<T><<<grids, blocks, 0, context.stream()>>>( RowwiseAddKernel<T><<<grids, blocks, 0, context.stream()>>>(
...@@ -113,7 +128,12 @@ void ColwiseSum<platform::CUDADeviceContext, double>::operator()( ...@@ -113,7 +128,12 @@ void ColwiseSum<platform::CUDADeviceContext, double>::operator()(
framework::Tensor* vector) { framework::Tensor* vector) {
auto in_dims = input.dims(); auto in_dims = input.dims();
auto size = input.numel() / in_dims[0]; auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector->numel(), size); PADDLE_ENFORCE_EQ(vector->numel(), size,
platform::errors::InvalidArgument(
"The size of input vector"
" should be equal to the size of input tensor column"
" dimension. Expected vector size=%d, but received %d",
size, vector->numel()));
framework::Tensor one; framework::Tensor one;
one.mutable_data<double>({in_dims[0]}, context.GetPlace()); one.mutable_data<double>({in_dims[0]}, context.GetPlace());
SetConstant<platform::CUDADeviceContext, double> set; SetConstant<platform::CUDADeviceContext, double> set;
...@@ -134,7 +154,12 @@ void RowwiseSum<platform::CUDADeviceContext, double>::operator()( ...@@ -134,7 +154,12 @@ void RowwiseSum<platform::CUDADeviceContext, double>::operator()(
framework::Tensor* vector) { framework::Tensor* vector) {
auto in_dims = input.dims(); auto in_dims = input.dims();
auto size = input.numel() / in_dims[0]; auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0]); PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0],
platform::errors::InvalidArgument(
"The size of input vector"
" should be equal to the size of input tensor row"
" dimension. Expected vector size=%d, but received %d",
in_dims[0], vector->numel()));
framework::Tensor one; framework::Tensor one;
one.mutable_data<double>({size}, context.GetPlace()); one.mutable_data<double>({size}, context.GetPlace());
SetConstant<platform::CUDADeviceContext, double> set; SetConstant<platform::CUDADeviceContext, double> set;
......
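Both RowwiseAdd specializations check the same two facts: the vector length must equal input.numel() / in_dims[0] (the row width) and the output shape must match the input shape, because the operation broadcasts the vector over every row. A plain-loop sketch of that semantics:

#include <cassert>
#include <vector>

// out[i][j] = in[i][j] + vec[j]; in and out are rows x size, vec has length size.
void rowwise_add(const std::vector<float>& in, const std::vector<float>& vec,
                 std::vector<float>* out, int rows, int size) {
  assert(static_cast<int>(vec.size()) == size);
  assert(in.size() == out->size() && static_cast<int>(in.size()) == rows * size);
  for (int i = 0; i < rows; ++i)
    for (int j = 0; j < size; ++j)
      (*out)[i * size + j] = in[i * size + j] + vec[j];
}

int main() {
  std::vector<float> in = {1, 2, 3, 4, 5, 6};  // 2 x 3
  std::vector<float> vec = {10, 20, 30};
  std::vector<float> out(6);
  rowwise_add(in, vec, &out, 2, 3);            // out = {11,22,33,14,25,36}
  return 0;
}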
...@@ -59,7 +59,12 @@ void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context, ...@@ -59,7 +59,12 @@ void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
framework::Tensor* out) { framework::Tensor* out) {
auto in_dims = input.dims(); auto in_dims = input.dims();
auto size = input.numel() / in_dims[0]; auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(out->numel(), size); PADDLE_ENFORCE_EQ(out->numel(), size,
platform::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor column"
" dimension. Expected output size=%d, but received %d",
size, out->numel()));
auto in = framework::EigenMatrix<T>::From(input); auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(*out); auto vec = framework::EigenVector<T>::Flatten(*out);
...@@ -78,7 +83,13 @@ class ColwiseSum<platform::CPUDeviceContext, T> { ...@@ -78,7 +83,13 @@ class ColwiseSum<platform::CPUDeviceContext, T> {
auto& in_dims = input.dims(); auto& in_dims = input.dims();
auto height = in_dims[0]; auto height = in_dims[0];
auto size = in_dims[1]; auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), size); PADDLE_ENFORCE_EQ(
out->numel(), size,
platform::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor column"
" dimension. Expected output size=%d, but received %d",
size, out->numel()));
T* out_buf = out->mutable_data<T>(out->place()); T* out_buf = out->mutable_data<T>(out->place());
const T* in_buf = input.data<T>(); const T* in_buf = input.data<T>();
...@@ -100,8 +111,16 @@ void RowwiseMean<DeviceContext, T>::operator()(const DeviceContext& context, ...@@ -100,8 +111,16 @@ void RowwiseMean<DeviceContext, T>::operator()(const DeviceContext& context,
const framework::Tensor& input, const framework::Tensor& input,
framework::Tensor* out) { framework::Tensor* out) {
auto in_dims = input.dims(); auto in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U); PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument(
PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); "The rank of input tensor "
"should be 2, but received %d",
in_dims.size()));
PADDLE_ENFORCE_EQ(out->numel(), in_dims[0],
platform::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor row"
" dimension. Expected output size=%d, but received %d",
in_dims[0], out->numel()));
auto in = framework::EigenMatrix<T>::From(input); auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(*out); auto vec = framework::EigenVector<T>::Flatten(*out);
...@@ -118,10 +137,19 @@ class RowwiseMean<platform::CPUDeviceContext, T> { ...@@ -118,10 +137,19 @@ class RowwiseMean<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) { const framework::Tensor& input, framework::Tensor* out) {
auto& in_dims = input.dims(); auto& in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U); PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument(
"The rank of input tensor "
"should be 2, but received %d",
in_dims.size()));
auto height = in_dims[0]; auto height = in_dims[0];
auto size = in_dims[1]; auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), height); PADDLE_ENFORCE_EQ(
out->numel(), height,
platform::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor row"
" dimension. Expected output size=%d, but received %d",
height, out->numel()));
auto inv_size = 1.0 / size; auto inv_size = 1.0 / size;
T* out_buf = out->mutable_data<T>(out->place()); T* out_buf = out->mutable_data<T>(out->place());
const T* in_buf = input.data<T>(); const T* in_buf = input.data<T>();
...@@ -141,8 +169,16 @@ void RowwiseSum<DeviceContext, T>::operator()(const DeviceContext& context, ...@@ -141,8 +169,16 @@ void RowwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
const framework::Tensor& input, const framework::Tensor& input,
framework::Tensor* out) { framework::Tensor* out) {
auto in_dims = input.dims(); auto in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U); PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument(
PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); "The rank of input tensor "
"should be 2, but received %d",
in_dims.size()));
PADDLE_ENFORCE_EQ(out->numel(), in_dims[0],
platform::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor row"
" dimension. Expected output size=%d, but received %d",
in_dims[0], out->numel()));
auto in = framework::EigenMatrix<T>::From(input); auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(*out); auto vec = framework::EigenVector<T>::Flatten(*out);
...@@ -159,10 +195,19 @@ class RowwiseSum<platform::CPUDeviceContext, T> { ...@@ -159,10 +195,19 @@ class RowwiseSum<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) { const framework::Tensor& input, framework::Tensor* out) {
auto& in_dims = input.dims(); auto& in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U); PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument(
"The rank of input tensor "
"should be 2, but received %d",
in_dims.size()));
auto height = in_dims[0]; auto height = in_dims[0];
auto size = in_dims[1]; auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), height); PADDLE_ENFORCE_EQ(
out->numel(), height,
platform::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor row"
" dimension. Expected output size=%d, but received %d",
height, out->numel()));
T* out_buf = out->mutable_data<T>(out->place()); T* out_buf = out->mutable_data<T>(out->place());
const T* in_buf = input.data<T>(); const T* in_buf = input.data<T>();
......
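The checks added in math_function_impl.h pin down the reduction shapes: ColwiseSum produces one value per column, while RowwiseMean and RowwiseSum expect a rank-2 input and produce one value per row. A loop-level sketch of the two reductions:

#include <vector>

// ColwiseSum: out[j] = sum_i in[i][j].
void colwise_sum(const std::vector<float>& in, int height, int width,
                 std::vector<float>* out) {
  out->assign(width, 0.0f);
  for (int i = 0; i < height; ++i)
    for (int j = 0; j < width; ++j) (*out)[j] += in[i * width + j];
}

// RowwiseMean: out[i] = (1 / width) * sum_j in[i][j].
void rowwise_mean(const std::vector<float>& in, int height, int width,
                  std::vector<float>* out) {
  out->assign(height, 0.0f);
  const float inv_size = 1.0f / width;
  for (int i = 0; i < height; ++i) {
    float sum = 0.0f;
    for (int j = 0; j < width; ++j) sum += in[i * width + j];
    (*out)[i] = sum * inv_size;
  }
}

int main() {
  std::vector<float> in = {1, 2, 3, 4, 5, 6};  // 2 x 3
  std::vector<float> cols, rows;
  colwise_sum(in, 2, 3, &cols);   // {5, 7, 9}
  rowwise_mean(in, 2, 3, &rows);  // {2, 5}
  return 0;
}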
...@@ -224,7 +224,11 @@ TEST(math_funciton, set_constant) { ...@@ -224,7 +224,11 @@ TEST(math_funciton, set_constant) {
auto* ctx = new paddle::platform::CPUDeviceContext(); auto* ctx = new paddle::platform::CPUDeviceContext();
paddle::operators::math::set_constant(*ctx, &t, 10); paddle::operators::math::set_constant(*ctx, &t, 10);
for (int64_t i = 0; i < t.numel(); ++i) { for (int64_t i = 0; i < t.numel(); ++i) {
PADDLE_ENFORCE_EQ(10, t.data<int>()[i]); PADDLE_ENFORCE_EQ(10, t.data<int>()[i],
paddle::platform::errors::InvalidArgument(
"Each value of input"
"tensor should be 10, but received %d.",
t.data<int>()[i]));
} }
delete ctx; delete ctx;
} }
......
...@@ -18,7 +18,12 @@ ...@@ -18,7 +18,12 @@
void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size, void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size,
const std::vector<float>& data) { const std::vector<float>& data) {
PADDLE_ENFORCE_EQ(size, data.size()); PADDLE_ENFORCE_EQ(
size, data.size(),
paddle::platform::errors::InvalidArgument(
"The size of argument data should"
" be equal to the argument size. Expected %d, but received %d.",
size, data.size()));
for (size_t i = 0; i < data.size(); ++i) { for (size_t i = 0; i < data.size(); ++i) {
in_ptr[i] = paddle::platform::float16(data[i]); in_ptr[i] = paddle::platform::float16(data[i]);
} }
......
...@@ -85,8 +85,9 @@ void PaddingFunctor(int rank, const framework::ExecutionContext& context, ...@@ -85,8 +85,9 @@ void PaddingFunctor(int rank, const framework::ExecutionContext& context,
PadFunction<DeviceContext, T, 6>(context, pads, src, pad_value, out); PadFunction<DeviceContext, T, 6>(context, pads, src, pad_value, out);
break; break;
default: default:
PADDLE_THROW( PADDLE_THROW(platform::errors::Unimplemented(
"PadOp only support tensors with no more than 6 dimensions."); "PadOp only support tensors with no more"
" than 6 dimensions currently."));
} }
} }
...@@ -114,8 +115,9 @@ void PaddingGradFunctor(int rank, const framework::ExecutionContext& context, ...@@ -114,8 +115,9 @@ void PaddingGradFunctor(int rank, const framework::ExecutionContext& context,
PadGradFunction<DeviceContext, T, 6>(context, pads, src, out); PadGradFunction<DeviceContext, T, 6>(context, pads, src, out);
break; break;
default: default:
PADDLE_THROW( PADDLE_THROW(platform::errors::Unimplemented(
"PadOp only support tensors with no more than 6 dimensions."); "PadOp only support tensors with no more"
" than 6 dimensions currently."));
} }
} }
......
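PaddingFunctor and PaddingGradFunctor dispatch on tensor rank and, after this change, raise an Unimplemented error instead of an untyped PADDLE_THROW for ranks above 6. A stripped-down sketch of that dispatch, with pad_rank<R> as a hypothetical stand-in for PadFunction<DeviceContext, T, R>:

#include <iostream>
#include <stdexcept>

template <int Rank>
void pad_rank() {  // hypothetical stand-in for PadFunction<..., Rank>
  std::cout << "padding a rank-" << Rank << " tensor\n";
}

void padding_dispatch(int rank) {
  switch (rank) {
    case 1: pad_rank<1>(); break;
    case 2: pad_rank<2>(); break;
    case 3: pad_rank<3>(); break;
    case 4: pad_rank<4>(); break;
    case 5: pad_rank<5>(); break;
    case 6: pad_rank<6>(); break;
    default:
      throw std::runtime_error(
          "PadOp only supports tensors with no more than 6 dimensions.");
  }
}

int main() {
  padding_dispatch(4);
  return 0;
}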
...@@ -19,6 +19,8 @@ limitations under the License. */ ...@@ -19,6 +19,8 @@ limitations under the License. */
#include <random> #include <random>
#include <vector> #include <vector>
#include "paddle/fluid/platform/enforce.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
...@@ -31,7 +33,10 @@ namespace math { ...@@ -31,7 +33,10 @@ namespace math {
class Sampler { class Sampler {
public: public:
explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) { explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) {
// PADDLE_ENFORCE_GT(range, 0, "Range should be greater than 0."); PADDLE_ENFORCE_GT(range, 0, platform::errors::InvalidArgument(
"Range should be"
" greater than 0, but recevied %d.",
range));
if (seed == 0) { if (seed == 0) {
std::random_device r; std::random_device r;
seed_ = r(); seed_ = r();
......
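In sampler.h the previously commented-out range check becomes a real PADDLE_ENFORCE_GT, so constructing a Sampler with a non-positive range now fails loudly; when no seed is supplied, one is drawn from std::random_device. A trimmed standalone version of just that constructor logic, using an exception in place of the Paddle macro:

#include <random>
#include <stdexcept>

class Sampler {
 public:
  explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) {
    if (range <= 0)
      throw std::invalid_argument("Range should be greater than 0.");
    seed_ = (seed == 0) ? std::random_device{}() : seed;
  }
  int64_t range() const { return range_; }
  unsigned int seed() const { return seed_; }

 private:
  int64_t range_;
  unsigned int seed_;
};

int main() {
  Sampler s(1000);  // valid range, seed picked from random_device
  return 0;
}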
...@@ -29,7 +29,12 @@ struct SelectedRowsAdd<platform::CPUDeviceContext, T> { ...@@ -29,7 +29,12 @@ struct SelectedRowsAdd<platform::CPUDeviceContext, T> {
const framework::SelectedRows& input2, const framework::SelectedRows& input2,
framework::SelectedRows* output) { framework::SelectedRows* output) {
auto in1_height = input1.height(); auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2.height()); PADDLE_ENFORCE_EQ(
in1_height, input2.height(),
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, input2.height()));
output->set_height(in1_height); output->set_height(in1_height);
auto& in1_rows = input1.rows(); auto& in1_rows = input1.rows();
...@@ -47,15 +52,31 @@ struct SelectedRowsAdd<platform::CPUDeviceContext, T> { ...@@ -47,15 +52,31 @@ struct SelectedRowsAdd<platform::CPUDeviceContext, T> {
auto& in2_value = input2.value(); auto& in2_value = input2.value();
auto in1_row_numel = in1_value.numel() / in1_rows.size(); auto in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size()); in1_row_numel, in2_value.numel() / in2_rows.size(),
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, in2_value.numel() / in2_rows.size()));
PADDLE_ENFORCE_EQ(
in1_row_numel, out_value->numel() / out_rows.size(),
platform::errors::InvalidArgument(
"The input and oupput width must be equal."
"But recieved input width = [%d], output width = [%d]",
in1_row_numel, out_value->numel() / out_rows.size()));
auto in1_place = input1.place(); auto in1_place = input1.place();
PADDLE_ENFORCE(platform::is_cpu_place(in1_place)); PADDLE_ENFORCE_EQ(platform::is_cpu_place(in1_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the CPU place."));
auto in2_place = input2.place(); auto in2_place = input2.place();
PADDLE_ENFORCE(platform::is_cpu_place(in2_place)); PADDLE_ENFORCE_EQ(platform::is_cpu_place(in2_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the CPU place."));
auto out_place = context.GetPlace(); auto out_place = context.GetPlace();
PADDLE_ENFORCE(platform::is_cpu_place(out_place)); PADDLE_ENFORCE_EQ(platform::is_cpu_place(out_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the CPU place."));
auto* out_data = out_value->data<T>(); auto* out_data = out_value->data<T>();
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
...@@ -82,15 +103,35 @@ struct SelectedRowsAddTensor<platform::CPUDeviceContext, T> { ...@@ -82,15 +103,35 @@ struct SelectedRowsAddTensor<platform::CPUDeviceContext, T> {
auto in1_height = input1.height(); auto in1_height = input1.height();
auto in2_dims = input2.dims(); auto in2_dims = input2.dims();
auto out_dims = output->dims(); auto out_dims = output->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE_EQ(in1_height, out_dims[0]); in1_height, in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, in2_dims[0]));
PADDLE_ENFORCE_EQ(
in1_height, out_dims[0],
platform::errors::InvalidArgument(
"The input and output height must be equal."
"But recieved input height = [%d], output height = [%d]",
in1_height, out_dims[0]));
auto& in1_value = input1.value(); auto& in1_value = input1.value();
auto& in1_rows = input1.rows(); auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height); in1_row_numel, input2.numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, input2.numel() / in1_height));
PADDLE_ENFORCE_EQ(
in1_row_numel, output->numel() / in1_height,
platform::errors::InvalidArgument(
"The input and output width must be equal."
"But recieved input width = [%d], output width = [%d]",
in1_row_numel, output->numel() / in1_height));
SetConstant<platform::CPUDeviceContext, T> functor; SetConstant<platform::CPUDeviceContext, T> functor;
functor(context, output, 0.0); functor(context, output, 0.0);
...@@ -121,7 +162,12 @@ struct SelectedRowsAddTo<platform::CPUDeviceContext, T> { ...@@ -121,7 +162,12 @@ struct SelectedRowsAddTo<platform::CPUDeviceContext, T> {
const int64_t input2_offset, const int64_t input2_offset,
framework::SelectedRows* input2) { framework::SelectedRows* input2) {
auto in1_height = input1.height(); auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2->height()); PADDLE_ENFORCE_EQ(
in1_height, input2->height(),
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, input2->height()));
auto& in1_rows = input1.rows(); auto& in1_rows = input1.rows();
auto& in2_rows = *(input2->mutable_rows()); auto& in2_rows = *(input2->mutable_rows());
...@@ -133,9 +179,13 @@ struct SelectedRowsAddTo<platform::CPUDeviceContext, T> { ...@@ -133,9 +179,13 @@ struct SelectedRowsAddTo<platform::CPUDeviceContext, T> {
in2_rows.Extend(in1_rows.begin(), in1_rows.end()); in2_rows.Extend(in1_rows.begin(), in1_rows.end());
auto in1_place = input1.place(); auto in1_place = input1.place();
PADDLE_ENFORCE(platform::is_cpu_place(in1_place)); PADDLE_ENFORCE_EQ(platform::is_cpu_place(in1_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the CPU place."));
auto in2_place = input2->place(); auto in2_place = input2->place();
PADDLE_ENFORCE(platform::is_cpu_place(in2_place)); PADDLE_ENFORCE_EQ(platform::is_cpu_place(in2_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the CPU place."));
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* in2_data = in2_value->data<T>(); auto* in2_data = in2_value->data<T>();
...@@ -163,7 +213,12 @@ struct SelectedRowsSumTo<platform::CPUDeviceContext, T> { ...@@ -163,7 +213,12 @@ struct SelectedRowsSumTo<platform::CPUDeviceContext, T> {
auto& in_rows = (*iter)->rows(); auto& in_rows = (*iter)->rows();
size += in_rows.end() - in_rows.begin(); size += in_rows.end() - in_rows.begin();
auto in1_height = (*iter)->height(); auto in1_height = (*iter)->height();
PADDLE_ENFORCE_EQ(in1_height, input2->height()); PADDLE_ENFORCE_EQ(in1_height, input2->height(),
platform::errors::InvalidArgument(
"The two inputs height must be equal."
"But recieved first input height = [%d], second "
"input height = [%d]",
in1_height, input2->height()));
} }
// concat rows // concat rows
std::vector<int64_t> in2_rows; std::vector<int64_t> in2_rows;
...@@ -201,13 +256,23 @@ struct SelectedRowsAddToTensor<platform::CPUDeviceContext, T> { ...@@ -201,13 +256,23 @@ struct SelectedRowsAddToTensor<platform::CPUDeviceContext, T> {
} }
auto in1_height = input1.height(); auto in1_height = input1.height();
auto in2_dims = input2->dims(); auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); PADDLE_ENFORCE_EQ(
in1_height, in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, in2_dims[0]));
auto& in1_value = input1.value(); auto& in1_value = input1.value();
auto& in1_rows = input1.rows(); auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); PADDLE_ENFORCE_EQ(
in1_row_numel, input2->numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, input2->numel() / in1_height));
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->data<T>(); auto* input2_data = input2->data<T>();
...@@ -302,10 +367,12 @@ struct MergeAdd<platform::CPUDeviceContext, T> { ...@@ -302,10 +367,12 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
continue; continue;
} }
PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1], PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1],
"all input should have same " platform::errors::InvalidArgument(
"dimension except for the first one"); "All inputs should have same "
"dimension except for the first one."));
PADDLE_ENFORCE_EQ(input_height, input->height(), PADDLE_ENFORCE_EQ(input_height, input->height(),
"all input should have same height"); platform::errors::InvalidArgument(
"All inputs should have same height."));
row_num += input->rows().size(); row_num += input->rows().size();
merged_row_set.insert(input->rows().begin(), input->rows().end()); merged_row_set.insert(input->rows().begin(), input->rows().end());
} }
...@@ -421,10 +488,12 @@ struct MergeAverage<platform::CPUDeviceContext, T> { ...@@ -421,10 +488,12 @@ struct MergeAverage<platform::CPUDeviceContext, T> {
continue; continue;
} }
PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1], PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1],
"all input should have same " platform::errors::InvalidArgument(
"dimension except for the first one"); "All inputs should have same "
"dimension except for the first one."));
PADDLE_ENFORCE_EQ(input_height, input->height(), PADDLE_ENFORCE_EQ(input_height, input->height(),
"all input should have same height"); platform::errors::InvalidArgument(
"All input should have same height."));
row_num += input->rows().size(); row_num += input->rows().size();
merged_row_set.insert(input->rows().begin(), input->rows().end()); merged_row_set.insert(input->rows().begin(), input->rows().end());
} }
...@@ -492,13 +561,23 @@ struct UpdateToTensor<platform::CPUDeviceContext, T> { ...@@ -492,13 +561,23 @@ struct UpdateToTensor<platform::CPUDeviceContext, T> {
framework::Tensor* input2) { framework::Tensor* input2) {
auto in1_height = input1.height(); auto in1_height = input1.height();
auto in2_dims = input2->dims(); auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); PADDLE_ENFORCE_EQ(
in1_height, in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, in2_dims[0]));
auto& in1_value = input1.value(); auto& in1_value = input1.value();
auto& in1_rows = input1.rows(); auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); PADDLE_ENFORCE_EQ(
in1_row_numel, input2->numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, input2->numel() / in1_height));
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->data<T>(); auto* input2_data = input2->data<T>();
......
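The width and height checks threaded through selected_rows_functor.cc all come from the SelectedRows layout: rows() holds destination row indices below height(), and value() stores one dense row of width row_numel for each of those indices. A dense sketch of the scatter-add that SelectedRowsAddToTensor performs, with plain vectors standing in for the tensors:

#include <cstdint>
#include <vector>

// Scatter-add: dense[rows[k]][:] += value[k][:] for every selected row k.
void selected_rows_add_to_tensor(const std::vector<int64_t>& rows,
                                 const std::vector<float>& value,   // rows.size() x width
                                 std::vector<float>* dense,         // height x width
                                 int64_t width) {
  for (size_t k = 0; k < rows.size(); ++k)
    for (int64_t j = 0; j < width; ++j)
      (*dense)[rows[k] * width + j] += value[k * width + j];
}

int main() {
  const int64_t height = 4, width = 2;
  std::vector<int64_t> rows = {0, 3};             // selected row indices < height
  std::vector<float> value = {1, 1, 2, 2};        // 2 x 2 block of updates
  std::vector<float> dense(height * width, 0.f);  // 4 x 2 destination
  selected_rows_add_to_tensor(rows, value, &dense, width);
  return 0;
}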
...@@ -30,7 +30,12 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> { ...@@ -30,7 +30,12 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
const framework::SelectedRows& input2, const framework::SelectedRows& input2,
framework::SelectedRows* output) { framework::SelectedRows* output) {
auto in1_height = input1.height(); auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2.height()); PADDLE_ENFORCE_EQ(
in1_height, input2.height(),
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, input2.height()));
output->set_height(in1_height); output->set_height(in1_height);
framework::Vector<int64_t> in1_rows(input1.rows()); framework::Vector<int64_t> in1_rows(input1.rows());
...@@ -48,18 +53,34 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> { ...@@ -48,18 +53,34 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
auto& in2_value = input2.value(); auto& in2_value = input2.value();
auto in1_row_numel = in1_value.numel() / in1_rows.size(); auto in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size()); in1_row_numel, in2_value.numel() / in2_rows.size(),
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, in2_value.numel() / in2_rows.size()));
PADDLE_ENFORCE_EQ(
in1_row_numel, out_value->numel() / out_rows.size(),
platform::errors::InvalidArgument(
"The input and oupput width must be equal."
"But recieved input width = [%d], output width = [%d]",
in1_row_numel, out_value->numel() / out_rows.size()));
auto* out_data = out_value->data<T>(); auto* out_data = out_value->data<T>();
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto in1_place = input1.place(); auto in1_place = input1.place();
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true); PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the GPU place."));
auto in2_place = input2.place(); auto in2_place = input2.place();
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true); PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the GPU place."));
auto out_place = context.GetPlace(); auto out_place = context.GetPlace();
PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_place), true); PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the GPU place."));
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, out_place), out_data, memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, out_place), out_data,
BOOST_GET_CONST(platform::CUDAPlace, in1_place), in1_data, BOOST_GET_CONST(platform::CUDAPlace, in1_place), in1_data,
...@@ -104,15 +125,35 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> { ...@@ -104,15 +125,35 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
auto in1_height = input1.height(); auto in1_height = input1.height();
auto in2_dims = input2.dims(); auto in2_dims = input2.dims();
auto out_dims = output->dims(); auto out_dims = output->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE_EQ(in1_height, out_dims[0]); in1_height, in2_dims[0],
platform::errors::InvalidArgument(
"The two inputs height must be equal."
"But recieved first input height = [%d], first input height = [%d]",
in1_height, in2_dims[0]));
PADDLE_ENFORCE_EQ(
in1_height, out_dims[0],
platform::errors::InvalidArgument(
"The input and output height must be equal."
"But recieved input height = [%d], output height = [%d]",
in1_height, out_dims[0]));
auto& in1_value = input1.value(); auto& in1_value = input1.value();
auto& in1_rows = input1.rows(); auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height); in1_row_numel, input2.numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, input2.numel() / in1_height));
PADDLE_ENFORCE_EQ(
in1_row_numel, output->numel() / in1_height,
platform::errors::InvalidArgument(
"The input and output width must be equal."
"But recieved input width = [%d], output width = [%d]",
in1_row_numel, output->numel() / in1_height));
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* in2_data = input2.data<T>(); auto* in2_data = input2.data<T>();
...@@ -148,7 +189,12 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> { ...@@ -148,7 +189,12 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
const int64_t input2_offset, const int64_t input2_offset,
framework::SelectedRows* input2) { framework::SelectedRows* input2) {
auto in1_height = input1.height(); auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2->height()); PADDLE_ENFORCE_EQ(
in1_height, input2->height(),
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, input2->height()));
auto& in1_rows = input1.rows(); auto& in1_rows = input1.rows();
auto& in2_rows = *(input2->mutable_rows()); auto& in2_rows = *(input2->mutable_rows());
...@@ -162,9 +208,13 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> { ...@@ -162,9 +208,13 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
} }
auto in1_place = input1.place(); auto in1_place = input1.place();
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true); PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the GPU place."));
auto in2_place = input2->place(); auto in2_place = input2->place();
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true); PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the GPU place."));
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* in2_data = in2_value->data<T>(); auto* in2_data = in2_value->data<T>();
...@@ -209,13 +259,23 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> { ...@@ -209,13 +259,23 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
framework::Tensor* input2) { framework::Tensor* input2) {
auto in1_height = input1.height(); auto in1_height = input1.height();
auto in2_dims = input2->dims(); auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); PADDLE_ENFORCE_EQ(
in1_height, in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, in2_dims[0]));
auto& in1_value = input1.value(); auto& in1_value = input1.value();
auto& in1_rows = input1.rows(); auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); PADDLE_ENFORCE_EQ(
in1_row_numel, input2->numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, input2->numel() / in1_height));
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* in2_data = input2->data<T>(); auto* in2_data = input2->data<T>();
...@@ -340,10 +400,12 @@ struct MergeAdd<platform::CUDADeviceContext, T> { ...@@ -340,10 +400,12 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
continue; continue;
} }
PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1], PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1],
"all input should have same " platform::errors::InvalidArgument(
"dimension except for the first one"); "All input should have same "
"dimension except for the first one."));
PADDLE_ENFORCE_EQ(input_height, input->height(), PADDLE_ENFORCE_EQ(input_height, input->height(),
"all input should have same height"); platform::errors::InvalidArgument(
"All input should have same height."));
merged_row_set.insert(input->rows().begin(), input->rows().end()); merged_row_set.insert(input->rows().begin(), input->rows().end());
} }
std::vector<int64_t> merge_rows_cpu(merged_row_set.begin(), std::vector<int64_t> merge_rows_cpu(merged_row_set.begin(),
...@@ -448,13 +510,23 @@ struct UpdateToTensor<platform::CUDADeviceContext, T> { ...@@ -448,13 +510,23 @@ struct UpdateToTensor<platform::CUDADeviceContext, T> {
auto in1_height = merged_in1.height(); auto in1_height = merged_in1.height();
auto in2_dims = input2->dims(); auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); PADDLE_ENFORCE_EQ(
in1_height, in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, in2_dims[0]));
auto& in1_value = merged_in1.value(); auto& in1_value = merged_in1.value();
auto& in1_rows = merged_in1.rows(); auto& in1_rows = merged_in1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); PADDLE_ENFORCE_EQ(
in1_row_numel, input2->numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, input2->numel() / in1_height));
auto* in1_data = in1_value.template data<T>(); auto* in1_data = in1_value.template data<T>();
auto* in2_data = input2->data<T>(); auto* in2_data = input2->data<T>();
......
...@@ -38,7 +38,9 @@ TEST(selected_rows_functor, gpu_add) { ...@@ -38,7 +38,9 @@ TEST(selected_rows_functor, gpu_add) {
{static_cast<int64_t>(rows1.size()), row_numel}), {static_cast<int64_t>(rows1.size()), row_numel}),
gpu_place); gpu_place);
functor(ctx, in1_value, 1.0); functor(ctx, in1_value, 1.0);
PADDLE_ENFORCE(cudaDeviceSynchronize()); PADDLE_ENFORCE_EQ(cudaDeviceSynchronize(), 0,
paddle::platform::errors::PreconditionNotMet(
"The all synchronization on the cuda is error!"));
std::vector<int64_t> rows2{0, 5, 7, 9}; std::vector<int64_t> rows2{0, 5, 7, 9};
std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{ std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
......
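The test now compares the return value of cudaDeviceSynchronize() against 0 (cudaSuccess) and reports a PreconditionNotMet error otherwise. With the raw CUDA runtime API the same check looks roughly like this (sketch assumes a CUDA toolchain and some prior asynchronous work):

#include <cstdio>
#include <cuda_runtime.h>

int main() {
  // Kernel launches or async copies would go here;
  // cudaDeviceSynchronize() surfaces any error they produced.
  cudaError_t err = cudaDeviceSynchronize();
  if (err != cudaSuccess) {
    std::fprintf(stderr, "CUDA synchronization failed: %s\n",
                 cudaGetErrorString(err));
    return 1;
  }
  return 0;
}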
...@@ -34,10 +34,16 @@ class Vol2ColFunctor<platform::CPUDeviceContext, T> { ...@@ -34,10 +34,16 @@ class Vol2ColFunctor<platform::CPUDeviceContext, T> {
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* col, const std::vector<int>& paddings, framework::Tensor* col,
const DataLayout data_layout) const { const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol.dims().size(), 4, PADDLE_ENFORCE_EQ(
"The dimension of vol should be 4."); vol.dims().size(), 4,
PADDLE_ENFORCE_EQ(col->dims().size(), 7, platform::errors::InvalidArgument("The dimension of"
"The dimension of col should be 7."); " vol should be 4, but received %d.",
vol.dims().size()));
PADDLE_ENFORCE_EQ(
col->dims().size(), 7,
platform::errors::InvalidArgument("The dimension of"
"col should be 7, but received %d.",
col->dims().size()));
int input_channels = int input_channels =
(data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]); (data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]);
...@@ -65,27 +71,33 @@ class Vol2ColFunctor<platform::CPUDeviceContext, T> { ...@@ -65,27 +71,33 @@ class Vol2ColFunctor<platform::CPUDeviceContext, T> {
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2]; int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back - auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) / ((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] + strides[0] +
1, 1;
output_depth, PADDLE_ENFORCE_EQ(
"input_depth and output_depth are " input_depth_tmp, output_depth,
"mismatching."); platform::errors::InvalidArgument(
PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down - "input_depth(%d) and output_depth(%d) are mismatching.",
((dilations[1] * (filter_height - 1) + 1))) / input_depth_tmp, output_depth));
strides[1] + auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
1, ((dilations[1] * (filter_height - 1) + 1))) /
output_height, strides[1] +
"input_height and output_height are " 1;
"mismatching."); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right - input_height_tmp, output_height,
((dilations[2] * (filter_width - 1) + 1))) / platform::errors::InvalidArgument(
strides[2] + "input_height(%d) and output_height(%d) are mismatching.",
1, input_height_tmp, output_height));
output_width, auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
"input_width and output_width are " ((dilations[2] * (filter_width - 1) + 1))) /
"mismatching."); strides[2] +
1;
PADDLE_ENFORCE_EQ(
input_width_tmp, output_width,
platform::errors::InvalidArgument(
"input_width(%d) and output_width(%d) are mismatching.",
input_width_tmp, output_width));
const T* vol_data = vol.data<T>(); const T* vol_data = vol.data<T>();
T* col_data = col->data<T>(); T* col_data = col->data<T>();
...@@ -140,10 +152,16 @@ class Col2VolFunctor<platform::CPUDeviceContext, T> { ...@@ -140,10 +152,16 @@ class Col2VolFunctor<platform::CPUDeviceContext, T> {
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* vol, const std::vector<int>& paddings, framework::Tensor* vol,
const DataLayout data_layout) const { const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol->dims().size(), 4, PADDLE_ENFORCE_EQ(
"The dimension of vol should be 4."); vol->dims().size(), 4,
PADDLE_ENFORCE_EQ(col.dims().size(), 7, platform::errors::InvalidArgument("The dimension of vol"
"The dimension of col should be 7."); " should be 4, but received %d.",
vol->dims().size()));
PADDLE_ENFORCE_EQ(
col.dims().size(), 7,
platform::errors::InvalidArgument("The dimension of col"
" should be 7, but received %d.",
col.dims().size()));
int input_channels = int input_channels =
(data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]); (data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]);
...@@ -170,27 +188,33 @@ class Col2VolFunctor<platform::CPUDeviceContext, T> { ...@@ -170,27 +188,33 @@ class Col2VolFunctor<platform::CPUDeviceContext, T> {
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2]; int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back - auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) / ((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] + strides[0] +
1, 1;
output_depth, PADDLE_ENFORCE_EQ(input_depth_tmp, output_depth,
"input_depth and output_depth are " platform::errors::InvalidArgument(
"mismatching."); "input_depth(%d)"
PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down - " and output_depth(%d) are mismatching.",
((dilations[1] * (filter_height - 1) + 1))) / input_depth_tmp, output_depth));
strides[1] + auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
1, ((dilations[1] * (filter_height - 1) + 1))) /
output_height, strides[1] +
"input_height and output_height are " 1;
"mismatching."); PADDLE_ENFORCE_EQ(input_height_tmp, output_height,
PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right - platform::errors::InvalidArgument(
((dilations[2] * (filter_width - 1) + 1))) / "input_height(%d)"
strides[2] + " and output_height(%d) are mismatching.",
1, input_height_tmp, output_height));
output_width, auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
"input_width and output_width are " ((dilations[2] * (filter_width - 1) + 1))) /
"mismatching."); strides[2] +
1;
PADDLE_ENFORCE_EQ(input_width_tmp, output_width,
platform::errors::InvalidArgument(
"input_width(%d)"
" and output_width(%d) are mismatching.",
input_width_tmp, output_width));
T* vol_data = vol->data<T>(); T* vol_data = vol->data<T>();
const T* col_data = col.data<T>(); const T* col_data = col.data<T>();
......
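All of the rewritten checks in vol2col.cc enforce the same relation between input size, padding, dilation, stride and the expected output size for each spatial dimension. A minimal standalone sketch of that relation (hypothetical helper, not Paddle code):

#include <cassert>

// output = (input + pad_before + pad_after - (dilation * (kernel - 1) + 1)) / stride + 1
int ConvOutputSize(int input, int kernel, int dilation, int pad_before,
                   int pad_after, int stride) {
  int dilated_kernel = dilation * (kernel - 1) + 1;  // effective kernel extent
  return (input + pad_before + pad_after - dilated_kernel) / stride + 1;
}

int main() {
  // e.g. depth 8, kernel 3, dilation 1, symmetric padding 1, stride 2 -> output depth 4
  assert(ConvOutputSize(8, 3, 1, 1, 1, 2) == 4);
  return 0;
}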
...@@ -90,10 +90,16 @@ class Vol2ColFunctor<platform::CUDADeviceContext, T> { ...@@ -90,10 +90,16 @@ class Vol2ColFunctor<platform::CUDADeviceContext, T> {
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* col, const std::vector<int>& paddings, framework::Tensor* col,
const DataLayout data_layout) const { const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol.dims().size(), 4, PADDLE_ENFORCE_EQ(
"The dimension of vol should be 4."); vol.dims().size(), 4,
PADDLE_ENFORCE_EQ(col->dims().size(), 7, platform::errors::InvalidArgument("The dimension of"
"The dimension of col should be 7."); " vol should be 4, but received %d.",
vol.dims().size()));
PADDLE_ENFORCE_EQ(
col->dims().size(), 7,
platform::errors::InvalidArgument("The dimension of"
"col should be 7, but received %d.",
col->dims().size()));
int input_channels = int input_channels =
(data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]); (data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]);
...@@ -117,27 +123,33 @@ class Vol2ColFunctor<platform::CUDADeviceContext, T> { ...@@ -117,27 +123,33 @@ class Vol2ColFunctor<platform::CUDADeviceContext, T> {
int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1]; int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1];
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2]; int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back - auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) / ((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] + strides[0] +
1, 1;
output_depth, PADDLE_ENFORCE_EQ(
"input_depth and output_depth are " input_depth_tmp, output_depth,
"mismatching."); platform::errors::InvalidArgument(
PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down - "input_depth(%d) and output_depth(%d) are mismatching.",
((dilations[1] * (filter_height - 1) + 1))) / input_depth_tmp, output_depth));
strides[1] + auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
1, ((dilations[1] * (filter_height - 1) + 1))) /
output_height, strides[1] +
"input_height and output_height are " 1;
"mismatching."); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right - input_height_tmp, output_height,
((dilations[2] * (filter_width - 1) + 1))) / platform::errors::InvalidArgument(
strides[2] + "input_height(%d) and output_height(%d) are mismatching.",
1, input_height_tmp, output_height));
output_width, auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
"input_width and output_width are " ((dilations[2] * (filter_width - 1) + 1))) /
"mismatching."); strides[2] +
1;
PADDLE_ENFORCE_EQ(
input_width_tmp, output_width,
platform::errors::InvalidArgument(
"input_width(%d) and output_width(%d) are mismatching.",
input_width_tmp, output_width));
int num_outputs = int num_outputs =
input_channels * output_depth * output_height * output_width; input_channels * output_depth * output_height * output_width;
...@@ -241,10 +253,16 @@ class Col2VolFunctor<platform::CUDADeviceContext, T> { ...@@ -241,10 +253,16 @@ class Col2VolFunctor<platform::CUDADeviceContext, T> {
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* vol, const std::vector<int>& paddings, framework::Tensor* vol,
const DataLayout data_layout) const { const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol->dims().size(), 4, PADDLE_ENFORCE_EQ(
"The dimension of vol should be 4."); vol->dims().size(), 4,
PADDLE_ENFORCE_EQ(col.dims().size(), 7, platform::errors::InvalidArgument("The dimension of vol"
"The dimension of col should be 7."); " should be 4, but received %d.",
vol->dims().size()));
PADDLE_ENFORCE_EQ(
col.dims().size(), 7,
platform::errors::InvalidArgument("The dimension of col"
" should be 7, but received %d.",
col.dims().size()));
int input_channels = int input_channels =
(data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]); (data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]);
...@@ -269,27 +287,33 @@ class Col2VolFunctor<platform::CUDADeviceContext, T> { ...@@ -269,27 +287,33 @@ class Col2VolFunctor<platform::CUDADeviceContext, T> {
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2]; int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back - auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) / ((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] + strides[0] +
1, 1;
output_depth, PADDLE_ENFORCE_EQ(input_depth_tmp, output_depth,
"input_depth and output_depth are " platform::errors::InvalidArgument(
"mismatching."); "input_depth(%d)"
PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down - " and output_depth(%d) are mismatching.",
((dilations[1] * (filter_height - 1) + 1))) / input_depth_tmp, output_depth));
strides[1] + auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
1, ((dilations[1] * (filter_height - 1) + 1))) /
output_height, strides[1] +
"input_height and output_height are " 1;
"mismatching."); PADDLE_ENFORCE_EQ(input_height_tmp, output_height,
PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right - platform::errors::InvalidArgument(
((dilations[2] * (filter_width - 1) + 1))) / "input_height(%d)"
strides[2] + " and output_height(%d) are mismatching.",
1, input_height_tmp, output_height));
output_width, auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
"input_width and output_width are " ((dilations[2] * (filter_width - 1) + 1))) /
"mismatching."); strides[2] +
1;
PADDLE_ENFORCE_EQ(input_width_tmp, output_width,
platform::errors::InvalidArgument(
"input_width(%d)"
" and output_width(%d) are mismatching.",
input_width_tmp, output_width));
int num_kernels = input_channels * input_depth * input_height * input_width; int num_kernels = input_channels * input_depth * input_height * input_width;
......
...@@ -30,8 +30,7 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> { ...@@ -30,8 +30,7 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> {
float std = context.Attr<float>("std"); float std = context.Attr<float>("std");
auto* tensor = context.Output<framework::Tensor>("Out"); auto* tensor = context.Output<framework::Tensor>("Out");
const std::string op_type = "gaussian_random"; auto shape = GetShape(context);
auto shape = GetShape(context, op_type);
tensor->Resize(shape); tensor->Resize(shape);
T* data = tensor->mutable_data<T>(context.GetPlace()); T* data = tensor->mutable_data<T>(context.GetPlace());
int64_t size = tensor->numel(); int64_t size = tensor->numel();
......
...@@ -48,6 +48,9 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -48,6 +48,9 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<float>("lars_weight_decay", AddAttr<float>("lars_weight_decay",
"(float, default 0.0005) LARS weight decay") "(float, default 0.0005) LARS weight decay")
.SetDefault(0.0005); .SetDefault(0.0005);
AddAttr<float>("epsilon",
"(float, default 0.0) epsilon to avoid Division by Zero.")
.SetDefault(0.0);
AddComment(R"DOC( AddComment(R"DOC(
Lars Momentum Optimizer. Lars Momentum Optimizer.
......
...@@ -23,14 +23,16 @@ __global__ void MomentumLarsKernel(const T* p, const T* g, const T* v, ...@@ -23,14 +23,16 @@ __global__ void MomentumLarsKernel(const T* p, const T* g, const T* v,
const T* learning_rate, const T mu, const T* learning_rate, const T mu,
const int64_t num, const T lars_coeff, const int64_t num, const T lars_coeff,
const T lars_weight_decay, const T* p_norm, const T lars_weight_decay, const T* p_norm,
const T* g_norm, T* p_out, T* v_out) { const T* g_norm, T* p_out, T* v_out,
const T epsilon) {
T lr = learning_rate[0]; T lr = learning_rate[0];
T local_lr = learning_rate[0]; T local_lr = learning_rate[0];
CUDA_KERNEL_LOOP(i, num) { CUDA_KERNEL_LOOP(i, num) {
if (p_norm[0] > 0 && g_norm[0] > 0) { if (lars_weight_decay > 0 && p_norm[0] > 0 && g_norm[0] > 0) {
local_lr = lr * lars_coeff * p_norm[0] / local_lr = lr * lars_coeff * p_norm[0] /
(g_norm[0] + lars_weight_decay * p_norm[0]); (g_norm[0] + lars_weight_decay * p_norm[0] + epsilon);
} }
T v_new = v[i] * mu + local_lr * (g[i] + lars_weight_decay * p[i]); T v_new = v[i] * mu + local_lr * (g[i] + lars_weight_decay * p[i]);
v_out[i] = v_new; v_out[i] = v_new;
p_out[i] = p[i] - v_new; p_out[i] = p[i] - v_new;
...@@ -54,6 +56,7 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> { ...@@ -54,6 +56,7 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
T mu = static_cast<T>(ctx.Attr<float>("mu")); T mu = static_cast<T>(ctx.Attr<float>("mu"));
T lars_coeff = ctx.Attr<float>("lars_coeff"); T lars_coeff = ctx.Attr<float>("lars_coeff");
T lars_weight_decay = ctx.Attr<float>("lars_weight_decay"); T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");
T epsilon = ctx.Attr<float>("epsilon");
auto* p = param->data<T>(); auto* p = param->data<T>();
auto* v = velocity->data<T>(); auto* v = velocity->data<T>();
...@@ -79,7 +82,7 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> { ...@@ -79,7 +82,7 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
eg_norm.device(*place) = eigen_g.square().sum().sqrt(); eg_norm.device(*place) = eigen_g.square().sum().sqrt();
MomentumLarsKernel<<<grid, block, 0, ctx.cuda_device_context().stream()>>>( MomentumLarsKernel<<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay, p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay,
p_norm_data, g_norm_data, p_out, v_out); p_norm_data, g_norm_data, p_out, v_out, epsilon);
} }
}; };
......
...@@ -39,6 +39,7 @@ class LarsMomentumOpKernel : public framework::OpKernel<T> { ...@@ -39,6 +39,7 @@ class LarsMomentumOpKernel : public framework::OpKernel<T> {
T mu = static_cast<T>(ctx.Attr<float>("mu")); T mu = static_cast<T>(ctx.Attr<float>("mu"));
T lars_coeff = ctx.Attr<float>("lars_coeff"); T lars_coeff = ctx.Attr<float>("lars_coeff");
T lars_weight_decay = ctx.Attr<float>("lars_weight_decay"); T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");
T epsilon = ctx.Attr<float>("epsilon");
auto p_out = framework::EigenVector<T>::Flatten(*param_out); auto p_out = framework::EigenVector<T>::Flatten(*param_out);
auto v_out = framework::EigenVector<T>::Flatten(*velocity_out); auto v_out = framework::EigenVector<T>::Flatten(*velocity_out);
...@@ -59,9 +60,9 @@ class LarsMomentumOpKernel : public framework::OpKernel<T> { ...@@ -59,9 +60,9 @@ class LarsMomentumOpKernel : public framework::OpKernel<T> {
ep_norm = p.square().sum().sqrt(); ep_norm = p.square().sum().sqrt();
eg_norm = g.square().sum().sqrt(); eg_norm = g.square().sum().sqrt();
T local_lr = lr[0]; T local_lr = lr[0];
if (ep_norm(0) > 0 && eg_norm(0) > 0) { if (lars_weight_decay > 0 && ep_norm(0) > 0 && eg_norm(0) > 0) {
local_lr = lr[0] * lars_coeff * ep_norm(0) / local_lr = lr[0] * lars_coeff * ep_norm(0) /
(eg_norm(0) + lars_weight_decay * ep_norm(0)); (eg_norm(0) + lars_weight_decay * ep_norm(0) + epsilon);
} }
v_out = v * mu + local_lr * (g + lars_weight_decay * p); v_out = v * mu + local_lr * (g + lars_weight_decay * p);
p_out = p - v_out; p_out = p - v_out;
......
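Both the CPU and CUDA LARS kernels above now skip the rescaling when lars_weight_decay is 0 and add the new epsilon attribute to the denominator to avoid division by zero. A standalone sketch of the local learning-rate rule (assumed names, not Paddle code):

#include <cstdio>

// local_lr = lr * lars_coeff * ||p|| / (||g|| + weight_decay * ||p|| + epsilon)
float LocalLearningRate(float lr, float lars_coeff, float weight_decay,
                        float p_norm, float g_norm, float epsilon) {
  if (weight_decay > 0.f && p_norm > 0.f && g_norm > 0.f) {
    return lr * lars_coeff * p_norm /
           (g_norm + weight_decay * p_norm + epsilon);
  }
  return lr;  // fall back to the global learning rate
}

int main() {
  std::printf("%f\n", LocalLearningRate(0.1f, 0.001f, 0.0005f, 10.f, 2.f, 0.f));
  return 0;
}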
include(operators) include(operators)
if(WITH_GPU) if(WITH_GPU)
register_operators(DEPS cub) if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
register_operators(DEPS cub)
else()
register_operators()
endif()
else() else()
register_operators() register_operators()
endif() endif()
...@@ -24,5 +28,9 @@ if(WITH_GPU) ...@@ -24,5 +28,9 @@ if(WITH_GPU)
endif() endif()
if(WITH_GPU) if(WITH_GPU)
nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor cub) if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor cub)
else()
nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor)
endif()
endif() endif()
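The CUDA-version branches above presumably reflect that CUB ships as part of the CUDA Toolkit from version 11 onward, so the separate cub dependency is only needed when building against older toolkits.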
...@@ -13,18 +13,138 @@ ...@@ -13,18 +13,138 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/reduce_ops/logsumexp_op.h" #include "paddle/fluid/operators/reduce_ops/logsumexp_op.h"
#include <memory> #include <algorithm>
#include <string> #include <string>
#include <utility>
#include <vector> #include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
class LogsumexpOpMaker : public ops::ReduceOpMaker { class LogsumexpOp : public framework::OperatorWithKernel {
protected: public:
virtual std::string GetName() const { return "logsumexp"; } using framework::OperatorWithKernel::OperatorWithKernel;
virtual std::string GetOpType() const { return "Reduce logsumexp"; }
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "logsumexp");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "logsumexp");
auto x_dims = ctx->GetInputDim("X");
auto x_rank = x_dims.size();
PADDLE_ENFORCE_LE(x_rank, 4,
platform::errors::InvalidArgument(
"The input tensor X's dimensions of logsumexp "
"should be less equal than 4. But received X's "
"dimensions = %d, X's shape = [%s].",
x_rank, x_dims));
auto axis = ctx->Attrs().Get<std::vector<int>>("axis");
PADDLE_ENFORCE_GT(
axis.size(), 0,
platform::errors::InvalidArgument(
"The size of axis of logsumexp "
"should be greater than 0. But received the size of axis "
"of logsumexp is %d.",
axis.size()));
for (size_t i = 0; i < axis.size(); i++) {
PADDLE_ENFORCE_LT(
axis[i], x_rank,
platform::errors::InvalidArgument(
"axis[%d] should be in the "
"range [-dimension(X), dimension(X)] "
"where dimesion(X) is %d. But received axis[i] = %d.",
i, x_rank, axis[i]));
PADDLE_ENFORCE_GE(
axis[i], -x_rank,
platform::errors::InvalidArgument(
"axis[%d] should be in the "
"range [-dimension(X), dimension(X)] "
"where dimesion(X) is %d. But received axis[i] = %d.",
i, x_rank, axis[i]));
if (axis[i] < 0) {
axis[i] += x_rank;
}
}
bool keepdim = ctx->Attrs().Get<bool>("keepdim");
bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
auto dims_vector = vectorize(x_dims);
if (reduce_all) {
if (keepdim)
ctx->SetOutputDim(
"Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
else
ctx->SetOutputDim("Out", {1});
} else {
if (keepdim) {
for (size_t i = 0; i < axis.size(); ++i) {
dims_vector[axis[i]] = 1;
}
} else {
const int kDelFlag = -1;
for (size_t i = 0; i < axis.size(); ++i) {
dims_vector[axis[i]] = kDelFlag;
}
dims_vector.erase(
std::remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
dims_vector.end());
}
if (!keepdim && dims_vector.size() == 0) {
dims_vector.push_back(1);
}
auto out_dims = framework::make_ddim(dims_vector);
ctx->SetOutputDim("Out", out_dims);
if (axis.size() > 0 && axis[0] != 0) {
// Only pass LoD when not reducing on the first dim.
ctx->ShareLoD("X", /*->*/ "Out");
}
}
}
};
class LogsumexpOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor) The input tensor. Tensors with rank at most 4 are "
"supported.");
AddOutput("Out", "(Tensor) The result tensor.");
AddAttr<std::vector<int>>(
"axis",
"(list<int>, default {0}) The dimensions to reduce. "
"Must be in the range [-rank(input), rank(input)). "
"If `axis[i] < 0`, the axis[i] to reduce is `rank + axis[i]`. "
"Note that reducing on the first dim will make the LoD info lost.")
.SetDefault({0});
AddAttr<bool>("keepdim",
"(bool, default false) "
"If true, retain the reduced dimension with length 1.")
.SetDefault(false);
AddAttr<bool>("reduce_all",
"(bool, default false) "
"If true, output a scalar reduced along all dimensions.")
.SetDefault(false);
AddComment(string::Sprintf(R"DOC(
logsumexp Operator.
This operator computes the logsumexp of the input tensor along the given axis.
The result tensor has 1 fewer dimension than the input unless keepdim is true.
If reduce_all is true, just reduce along all dimensions and output a scalar.
)DOC"));
}
};
class LogsumexpGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "logsumexp");
OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "logsumexp");
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
"Out@GRAD", "logsumexp");
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
}; };
template <typename T> template <typename T>
...@@ -32,7 +152,6 @@ class LogsumexpGradOpMaker : public framework::SingleGradOpMaker<T> { ...@@ -32,7 +152,6 @@ class LogsumexpGradOpMaker : public framework::SingleGradOpMaker<T> {
public: public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker; using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override { void Apply(GradOpPtr<T> op) const override {
op->SetType("logsumexp_grad"); op->SetType("logsumexp_grad");
op->SetInput("X", this->Input("X")); op->SetInput("X", this->Input("X"));
...@@ -46,18 +165,17 @@ class LogsumexpGradOpMaker : public framework::SingleGradOpMaker<T> { ...@@ -46,18 +165,17 @@ class LogsumexpGradOpMaker : public framework::SingleGradOpMaker<T> {
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OPERATOR(logsumexp, ops::ReduceOp, ops::LogsumexpOpMaker, namespace ops = paddle::operators;
REGISTER_OPERATOR(logsumexp, ops::LogsumexpOp, ops::LogsumexpOpMaker,
ops::LogsumexpGradOpMaker<paddle::framework::OpDesc>, ops::LogsumexpGradOpMaker<paddle::framework::OpDesc>,
ops::LogsumexpGradOpMaker<paddle::imperative::OpBase>); ops::LogsumexpGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(logsumexp_grad, ops::ReduceGradOp); REGISTER_OPERATOR(logsumexp_grad, ops::LogsumexpGradOp);
REGISTER_OP_CPU_KERNEL(logsumexp,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
float, ops::LogsumexpFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
double, ops::LogsumexpFunctor>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
logsumexp_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, logsumexp, ops::LogsumexpKernel<paddle::platform::CPUDeviceContext, float>,
float, ops::LogsumexpGradFunctor>, ops::LogsumexpKernel<paddle::platform::CPUDeviceContext, double>);
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double, REGISTER_OP_CPU_KERNEL(
ops::LogsumexpGradFunctor>); logsumexp_grad,
ops::LogsumexpGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::LogsumexpGradKernel<paddle::platform::CPUDeviceContext, double>);
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
#include "paddle/fluid/operators/reduce_ops/logsumexp_op.h" #include "paddle/fluid/operators/reduce_ops/logsumexp_op.h"
REGISTER_OP_CUDA_KERNEL(logsumexp, namespace ops = paddle::operators;
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::LogsumexpFunctor>, REGISTER_OP_CUDA_KERNEL(
ops::ReduceKernel<paddle::platform::CUDADeviceContext, logsumexp, ops::LogsumexpKernel<paddle::platform::CUDADeviceContext, float>,
double, ops::LogsumexpFunctor>); ops::LogsumexpKernel<paddle::platform::CUDADeviceContext, double>);
...@@ -15,8 +15,9 @@ ...@@ -15,8 +15,9 @@
// .part used to speed up nvcc compile // .part used to speed up nvcc compile
#include "paddle/fluid/operators/reduce_ops/logsumexp_op.h" #include "paddle/fluid/operators/reduce_ops/logsumexp_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
logsumexp_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, logsumexp_grad,
float, ops::LogsumexpGradFunctor>, ops::LogsumexpGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double, ops::LogsumexpGradKernel<paddle::platform::CUDADeviceContext, double>);
ops::LogsumexpGradFunctor>);
...@@ -209,9 +209,14 @@ class RunProgramOpKernel : public framework::OpKernel<T> { ...@@ -209,9 +209,14 @@ class RunProgramOpKernel : public framework::OpKernel<T> {
auto output_vars = ctx.MultiOutputVar("Out"); auto output_vars = ctx.MultiOutputVar("Out");
auto input_var_names = ctx.InputNames("X"); auto input_var_names = ctx.InputNames("X");
auto param_names = ctx.InputNames("Params");
auto output_var_names = ctx.OutputNames("Out"); auto output_var_names = ctx.OutputNames("Out");
// current program may not hold parameters
std::vector<std::string> param_names;
if (!param_vars.empty()) {
param_names = ctx.InputNames("Params");
}
auto *block = ctx.Attr<BlockDesc *>("global_block"); auto *block = ctx.Attr<BlockDesc *>("global_block");
auto *program = block->Program(); auto *program = block->Program();
auto start_op_index = ctx.Attr<int64_t>("start_op_index"); auto start_op_index = ctx.Attr<int64_t>("start_op_index");
......