Commit cc80d961 authored by: J jingqinghe
@@ -16,6 +16,7 @@ else()
set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs10 "30 35 50 52 60 61 70 75")
+ set(paddle_known_gpu_archs11 "52 60 61 70 75 80")
endif()
######################################################################################
@@ -188,6 +189,10 @@ elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) # CUDA 10.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs10})
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
+ elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.x
+ set(paddle_known_gpu_archs ${paddle_known_gpu_archs11})
+ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
+ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
endif()
add_definitions("-DPADDLE_CUDA_BINVER=\"${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}\"")
...
@@ -19,7 +19,7 @@ SET(DGC_SOURCES_DIR "${THIRD_PARTY_PATH}/dgc/src/extern_dgc")
SET(DGC_INSTALL_DIR "${THIRD_PARTY_PATH}/install/dgc")
SET(DGC_INCLUDE_DIR "${DGC_INSTALL_DIR}/include" CACHE PATH "dgc include directory." FORCE)
SET(DGC_LIBRARIES "${DGC_INSTALL_DIR}/lib/libdgc.a" CACHE FILEPATH "dgc library." FORCE)
- SET(DGC_URL "http://fleet.bj.bcebos.com/collective_ef2216a.tgz")
+ SET(DGC_URL "https://fleet.bj.bcebos.com/dgc/collective_f66ef73.tgz")
INCLUDE_DIRECTORIES(${DGC_INCLUDE_DIR})
cache_third_party(extern_dgc
@@ -30,7 +30,7 @@ ExternalProject_Add(
extern_dgc
${EXTERNAL_PROJECT_LOG_ARGS}
"${DGC_DOWNLOAD_CMD}"
- URL_MD5 "2f67549fd5f1262383d83289abc4f88f"
+ URL_MD5 "94e6fa1bc97169d0e1aad44570fe3251"
PREFIX "${DGC_PREFIX_DIR}"
SOURCE_DIR "${DGC_SOURCES_DIR}"
CONFIGURE_COMMAND ""
...
@@ -34,7 +34,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite)
if(NOT LITE_GIT_TAG)
- set(LITE_GIT_TAG dfdfa6440c83bf0b415f9f5a9ff84842ce0bb0fa)
+ set(LITE_GIT_TAG 6d2b2a4028a58715b01887b04eb9bff8432eb184)
endif()
if(NOT CUDA_ARCH_NAME)
...
@@ -19,8 +19,8 @@ SET(MKLDNN_PREFIX_DIR ${THIRD_PARTY_PATH}/mkldnn)
SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn)
SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn)
SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
- SET(MKLDNN_REPOSITORY https://github.com/intel/mkl-dnn.git)
- SET(MKLDNN_TAG 1ea812f4f5aa1bd989372a23ab50d0f0f81ee677)
+ SET(MKLDNN_REPOSITORY https://github.com/oneapi-src/oneDNN.git)
+ SET(MKLDNN_TAG 64a48f9565aa72f6359917b3406328075a409939)
# Introduce variables:
# * CMAKE_INSTALL_LIBDIR
...
@@ -18,7 +18,7 @@ SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc)
SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
set(WARPCTC_REPOSITORY https://github.com/baidu-research/warp-ctc.git)
- set(WARPCTC_TAG bc29dcfff07ced1c7a19a4ecee48e5ad583cef8e)
+ set(WARPCTC_TAG fc7f226b93758216a03b1be9d24593a12819b984)
SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
CACHE PATH "Warp-ctc Directory" FORCE)
...
@@ -28,7 +28,15 @@ function(CheckCompilerCXX11Flag)
endfunction()
CheckCompilerCXX11Flag()
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+ if (WITH_GPU)
+   if (${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.0)
+     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
+   else()
+     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+   endif()
+ else()
+   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+ endif()
# safe_set_flag
#
# Set a compile flag only if compiler is support
...
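The CMake change above switches the host standard to C++14 only for CUDA 11+ builds. As a hedged illustration (not part of the commit), a translation unit can assert the same invariant at compile time:

// Illustrative guard only; it mirrors the intent of the flag change above.
#if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 11
static_assert(__cplusplus >= 201402L,
              "CUDA 11.x builds are expected to compile with -std=c++14.");
#else
static_assert(__cplusplus >= 201103L, "A C++11 compiler is required.");
#endif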
@@ -243,9 +243,10 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
ENDIF()
if(WITH_GPU)
- include(external/cub) # download cub
- list(APPEND third_party_deps extern_cub)
+ if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
+   include(external/cub) # download cub
+   list(APPEND third_party_deps extern_cub)
+ endif()
set(CUDAERROR_URL "http://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "" FORCE)
file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") # download file cudaErrorMessage
endif(WITH_GPU)
...
@@ -49,7 +49,8 @@ std::vector<std::string> PD_GetGradOpDescStrs(
for (size_t i = 0; i < op_num; ++i) {
PADDLE_ENFORCE_EQ(
grad_op_descs[i]->Proto()->SerializePartialToString(&ret[i]), true,
- "Cannot serialize message.");
+ paddle::platform::errors::Unavailable(
+ "Cannot serialize operator desc message."));
}
}
return ret;
...
@@ -36,7 +36,10 @@ message AMPConfig {
repeated string custom_black_varnames = 9;
}
- message LocalSGDConfig { optional int32 k_steps = 1 [ default = 4 ]; }
+ message LocalSGDConfig {
+   optional int32 k_steps = 1 [ default = 1 ];
+   optional int32 begin_step = 2 [ default = 1 ];
+ }
message GradientMergeConfig {
optional int32 k_steps = 1 [ default = 1 ];
@@ -52,6 +55,8 @@ message DGCConfig {
message LarsConfig {
optional float lars_coeff = 1 [ default = 0.001 ];
optional float lars_weight_decay = 2 [ default = 0.0005 ];
+ optional float epsilon = 3 [ default = 0.0 ];
+ repeated string exclude_from_weight_decay = 4;
}
message LambConfig {
...
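For context, a hedged sketch of filling the extended LocalSGDConfig and LarsConfig messages from C++ through the protobuf-generated setters; the `paddle::fleet` namespace is an assumption about the generated package name, not something stated in this diff.

// Assumption: the .proto package generates into paddle::fleet; adjust if it differs.
paddle::fleet::LocalSGDConfig local_sgd;
local_sgd.set_k_steps(1);        // default changed from 4 to 1 above
local_sgd.set_begin_step(1);     // newly added field

paddle::fleet::LarsConfig lars;
lars.set_epsilon(0.0f);                            // newly added field
lars.add_exclude_from_weight_decay("batch_norm");  // newly added repeated field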
@@ -25,7 +25,7 @@ bool NCCLWrapper::is_initialized_ = false;
void NCCLWrapper::InitNCCL() {
#if defined(PADDLE_WITH_NCCL)
- PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
+ PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitRank(
&(nccl_info_.comm_), nccl_info_.global_ranks_, nccl_info_.nccl_id_,
nccl_info_.my_global_rank_));
#endif
@@ -41,7 +41,8 @@ void NCCLWrapper::SetNCCLId(const NCCLInfo& nccl_info) {
NCCLInfo NCCLWrapper::GetNCCLId() {
#if defined(PADDLE_WITH_NCCL)
- PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_)));
+ PADDLE_ENFORCE_CUDA_SUCCESS(
+ platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_)));
#endif
return nccl_info_;
}
@@ -52,8 +53,8 @@ void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank,
nccl_info_.local_rank_ = local_rank;
nccl_info_.my_global_rank_ = global_rank;
nccl_info_.global_ranks_ = ranks;
- PADDLE_ENFORCE(cudaSetDevice(local_rank));
- PADDLE_ENFORCE(cudaStreamCreate(&(nccl_info_.stream_)));
+ PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(local_rank));
+ PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&(nccl_info_.stream_)));
#endif
return;
}
@@ -65,7 +66,7 @@ void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope,
auto var = scope.FindVar(name);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
int32_t total_size = tensor->numel();
- PADDLE_ENFORCE(platform::dynload::ncclBcast(
+ PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
reinterpret_cast<void*>(tensor->data<float>()), total_size, ncclFloat,
root_rank, nccl_info_.comm_, nccl_info_.stream_));
cudaStreamSynchronize(nccl_info_.stream_);
...
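The hunks above replace the generic PADDLE_ENFORCE(...) with PADDLE_ENFORCE_CUDA_SUCCESS, so CUDA/NCCL return codes are checked as status codes rather than as booleans. A minimal stand-in of that pattern, not Paddle's actual macro:

#include <cuda_runtime.h>
#include <stdexcept>
#include <string>

// Sketch only: convert a non-success cudaError_t into an exception, which is
// roughly what the CUDA-aware enforce path does for the call sites above.
inline void CheckCudaSuccess(cudaError_t status, const char* expr) {
  if (status != cudaSuccess) {
    throw std::runtime_error(std::string(expr) + " failed: " +
                             cudaGetErrorString(status));
  }
}
// e.g. CheckCudaSuccess(cudaSetDevice(local_rank), "cudaSetDevice");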
@@ -615,6 +615,16 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope,
GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv_out, transpose2_qkv_out,
multihead_pattern);
+ // If weights or biases in qkv's fc are shared by multiple multihead_matmul
+ // patterns, we do not support this kind of fusion, this pass will not take
+ // effect.
+ bool is_fc_params_shared =
+ mul0_w->outputs.size() > 1 || mul1_w->outputs.size() > 1 ||
+ mul2_w->outputs.size() > 1 || eltadd0_b->outputs.size() > 1 ||
+ eltadd1_b->outputs.size() > 1 || eltadd2_b->outputs.size() > 1;
+ if (is_fc_params_shared) {
+ return;
+ }
fuse_creater(input0, mul0, mul1, mul2, mul0_out, mul1_out, mul2_out, mul0_w,
mul1_w, mul2_w, eltadd0_b, eltadd1_b, eltadd2_b, eltadd_qk_b,
reshape2_0, reshape2_qkv_out, scale, scale_out);
...
@@ -19,13 +19,17 @@ namespace paddle {
namespace framework {
extern size_t SizeOfType(proto::VarType::Type type);
void Tensor::check_memory_size() const {
- PADDLE_ENFORCE_NOT_NULL(
- holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
+ PADDLE_ENFORCE_NOT_NULL(holder_, platform::errors::PreconditionNotMet(
+ "Tensor holds no memory. "
+ "Call Tensor::mutable_data firstly."));
PADDLE_ENFORCE_LE(
numel() * SizeOfType(type()), memory_size(),
- "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
- "first to re-allocate memory.\n"
- "or maybe the required data-type mismatches the data already stored.");
+ platform::errors::PreconditionNotMet(
+ "Tensor's dimension is out of bound."
+ "Tensor's dimension must be equal or less than the size of its "
+ "memory."
+ "But received Tensor's dimension is d%, memory's size is %d.",
+ numel() * SizeOfType(type()), memory_size()));
}
Tensor::Tensor(const proto::VarType::Type& dtype) : type_(dtype), offset_(0) {}
@@ -37,15 +41,21 @@ size_t Tensor::memory_size() const {
void* Tensor::mutable_data(const platform::Place& place,
proto::VarType::Type type, size_t requested_size) {
type_ = type;
- PADDLE_ENFORCE_GE(numel(), 0,
- "When calling this method, the Tensor's numel must be "
- "equal or larger than zero. "
- "Please check Tensor::dims, or Tensor::Resize has been "
- "called first. The Tensor's shape is [",
- dims(), "] now");
+ PADDLE_ENFORCE_GE(
+ numel(), 0,
+ platform::errors::PreconditionNotMet(
+ "The Tensor's element number must be equal or greater than zero. "
+ "The Tensor's shape is [",
+ dims(), "] now"));
size_t size = numel() * SizeOfType(type);
if (requested_size) {
- PADDLE_ENFORCE_GE(requested_size, size);
+ PADDLE_ENFORCE_GE(
+ requested_size, size,
+ platform::errors::InvalidArgument(
+ "The requested memory size is less than the memory size of Tensor. "
+ "But received requested memory size is d%, "
+ "memory size of Tensor is %d.",
+ requested_size, size));
size = requested_size;
}
/* some versions of boost::variant don't have operator!= */
@@ -62,8 +72,8 @@ void* Tensor::mutable_data(const platform::Place& place,
void* Tensor::mutable_data(const platform::Place& place,
size_t requested_size) {
- PADDLE_ENFORCE_NOT_NULL(
- this->holder_, "Cannot invoke mutable data if current hold nothing.");
+ PADDLE_ENFORCE_NOT_NULL(this->holder_, platform::errors::PreconditionNotMet(
+ "The tensor is not initialized."));
return mutable_data(place, type_, requested_size);
}
@@ -75,12 +85,20 @@ Tensor& Tensor::ShareDataWith(const Tensor& src) {
Tensor Tensor::Slice(int64_t begin_idx, int64_t end_idx) const {
check_memory_size();
- PADDLE_ENFORCE_GE(begin_idx, 0,
- "The start row index must be greater than 0.");
- PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound.");
+ PADDLE_ENFORCE_GE(
+ begin_idx, 0,
+ platform::errors::OutOfRange("The start row index must be greater than 0."
+ "But received the start index is d%.",
+ begin_idx));
+ PADDLE_ENFORCE_LE(
+ end_idx, dims_[0],
+ platform::errors::OutOfRange("The end row index is out of bound."));
PADDLE_ENFORCE_LT(
begin_idx, end_idx,
- "The start row index must be lesser than the end row index.");
+ platform::errors::InvalidArgument(
+ "The start row index must be less than the end row index."
+ "But received the start index = %d, the end index = %d.",
+ begin_idx, end_idx));
if (dims_[0] == 1) {
return *this;
...
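A hedged usage sketch of the contract the rewritten Slice checks enforce: Slice(begin, end) takes rows [begin, end) along dimension 0, so begin must be >= 0, end <= dims()[0], and begin < end.

#include "paddle/fluid/framework/tensor.h"

paddle::framework::Tensor t;
t.Resize(paddle::framework::make_ddim({4, 8}));
t.mutable_data<float>(paddle::platform::CPUPlace());
// Rows 1 and 2 of the 4 x 8 tensor; the result has shape [2, 8].
paddle::framework::Tensor rows = t.Slice(1, 3);
// t.Slice(3, 3) would now fail with InvalidArgument (begin not < end).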
@@ -131,13 +131,17 @@ class Tensor {
const platform::Place& place() const {
PADDLE_ENFORCE_NOT_NULL(
- holder_, "Tensor not initialized yet when Tensor::place() is called.");
+ holder_,
+ platform::errors::PreconditionNotMet(
+ "Tensor not initialized yet when Tensor::place() is called."));
return holder_->place();
}
proto::VarType::Type type() const {
PADDLE_ENFORCE_NOT_NULL(
- holder_, "Tensor not initialized yet when Tensor::type() is called.");
+ holder_,
+ platform::errors::PreconditionNotMet(
+ "Tensor not initialized yet when Tensor::type() is called."));
return type_;
}
...
@@ -43,9 +43,13 @@ inline T* Tensor::data() {
check_memory_size();
bool valid =
std::is_same<T, void>::value || type_ == DataTypeTrait<T>::DataType();
- PADDLE_ENFORCE(
- valid, "Tensor holds the wrong type, it holds %s, but desires to be %s",
- DataTypeToString(type_), DataTypeToString(DataTypeTrait<T>::DataType()));
+ PADDLE_ENFORCE_EQ(
+ valid, true,
+ platform::errors::InvalidArgument(
+ "Tensor holds the wrong type, it holds %s, but desires to be %s",
+ DataTypeToString(type_),
+ DataTypeToString(DataTypeTrait<T>::DataType())));
return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_);
}
@@ -69,9 +73,12 @@ inline T* Tensor::mutable_data(const platform::Place& place,
inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
int rank = src.dims().size();
PADDLE_ENFORCE_GE(
- rank, 2,
- "'ReshapeToMatrix()' is only used for flatten high rank "
- "tensors to matrixs. Can not be used in reshaping vectors.");
+ rank, 2, platform::errors::InvalidArgument(
+ "'ReshapeToMatrix()' is only used for flatten high rank "
+ "tensors to matrixs. The dimensions of Tensor must be "
+ "greater or equal than 2. "
+ "But received dimensions of Tensor is %d",
+ rank));
if (rank == 2) {
return src;
}
...
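A hedged sketch of what the strengthened rank check above guards: ReshapeToMatrix flattens dimensions [0, num_col_dims) into rows and the remaining dimensions into columns, so the source must have rank of at least 2.

paddle::framework::Tensor src;
src.Resize(paddle::framework::make_ddim({2, 3, 4, 5}));
src.mutable_data<float>(paddle::platform::CPUPlace());
// Rows collapse dims {2, 3} -> 6, columns collapse dims {4, 5} -> 20.
paddle::framework::Tensor mat =
    paddle::framework::ReshapeToMatrix(src, /*num_col_dims=*/2);
// A rank-1 source would now trigger the InvalidArgument message above.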
@@ -41,7 +41,7 @@ TEST(Tensor, DataAssert) {
std::string ex_msg = err.what();
EXPECT_TRUE(ex_msg.find("holder_ should not be null") != std::string::npos);
EXPECT_TRUE(ex_msg.find("Tensor holds no memory. Call "
- "Tensor::mutable_data first.") !=
+ "Tensor::mutable_data firstly.") !=
std::string::npos);
}
ASSERT_TRUE(caught);
@@ -157,7 +157,7 @@ TEST(Tensor, ShareDataWith) {
EXPECT_TRUE(ex_msg.find("holder_ should not be null") !=
std::string::npos);
EXPECT_TRUE(ex_msg.find("Tensor holds no memory. Call "
- "Tensor::mutable_data first.") !=
+ "Tensor::mutable_data firstly.") !=
std::string::npos);
}
ASSERT_TRUE(caught);
...
@@ -42,7 +42,8 @@ void ThreadPool::Init() {
num_threads = FLAGS_dist_threadpool_size;
VLOG(1) << "set dist_threadpool_size to " << num_threads;
}
- PADDLE_ENFORCE_GT(num_threads, 0);
+ PADDLE_ENFORCE_GT(num_threads, 0, platform::errors::InvalidArgument(
+ "The number of threads is 0."));
threadpool_.reset(new ThreadPool(num_threads));
}
}
@@ -83,7 +84,8 @@ void ThreadPool::TaskLoop() {
}
if (tasks_.empty()) {
- PADDLE_THROW("This thread has no task to Run");
+ PADDLE_THROW(platform::errors::Unavailable(
+ "Current thread has no task to Run."));
}
// pop a task from the task queue
...
@@ -91,7 +91,8 @@ class ThreadPool {
{
std::unique_lock<std::mutex> lock(mutex_);
if (!running_) {
- PADDLE_THROW("enqueue on stopped ThreadPool");
+ PADDLE_THROW(platform::errors::Unavailable(
+ "Task is enqueued into stopped ThreadPool."));
}
tasks_.push(std::move(task));
}
...
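A hedged usage sketch of the enqueue path the new Unavailable error protects; it assumes the framework ThreadPool's singleton accessor and a future-returning Run, neither of which is shown in this diff.

// Assumption: ThreadPool::GetInstance() and a std::future-returning Run()
// exist in the surrounding framework code; only the error path is new here.
auto* pool = paddle::framework::ThreadPool::GetInstance();
std::future<void> done = pool->Run([] { /* some small task */ });
done.wait();  // enqueueing after the pool has stopped now throws Unavailable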
@@ -43,8 +43,9 @@ void VarDesc::SetTensorDescNum(size_t num) {
} break;
default:
PADDLE_THROW(
- "Setting 'sub_tensor_number' is not supported by the type of var %s.",
- this->Name());
+ platform::errors::Unavailable("Setting 'sub_tensor_number' is not "
+ "supported by the %s type variable.",
+ this->Name()));
}
}
@@ -55,8 +56,9 @@ size_t VarDesc::GetTensorDescNum() const {
break;
default:
PADDLE_THROW(
- "Getting 'sub_tensor_number' is not supported by the type of var %s.",
- this->Name());
+ platform::errors::Unavailable("Getting 'sub_tensor_number' is not "
+ "supported by the %s type variable.",
+ this->Name()));
}
}
@@ -133,9 +135,9 @@ void VarDesc::SetLoDLevel(int32_t lod_level) {
desc_.mutable_type()->mutable_tensor_array()->set_lod_level(lod_level);
break;
default:
- PADDLE_THROW(
- "Setting 'lod_level' is not supported by the type of var %s.",
- this->Name());
+ PADDLE_THROW(platform::errors::Unavailable(
+ "Setting 'lod_level' is not supported by the %s type variable.",
+ this->Name()));
}
}
@@ -157,9 +159,9 @@ void VarDesc::SetLoDLevels(const std::vector<int32_t> &multiple_lod_level) {
}
} break;
default:
- PADDLE_THROW(
- "Setting 'lod_levels' is not supported by the type of var %s.",
- this->Name());
+ PADDLE_THROW(platform::errors::Unavailable(
+ "Setting 'lod_levels' is not supported by the %s type variable",
+ this->Name()));
}
}
@@ -170,9 +172,9 @@ int32_t VarDesc::GetLoDLevel() const {
case proto::VarType::LOD_TENSOR_ARRAY:
return desc_.type().tensor_array().lod_level();
default:
- PADDLE_THROW(
- "Getting 'lod_level' is not supported by the type of var %s.",
- this->Name());
+ PADDLE_THROW(platform::errors::Unavailable(
+ "Getting 'lod_level' is not supported by the %s type variable.",
+ this->Name()));
}
}
@@ -187,15 +189,19 @@ std::vector<int32_t> VarDesc::GetLoDLevels() const {
return res;
break;
default:
- PADDLE_THROW(
- "Getting 'lod_levels' is not supported by the type of var %s.",
- this->Name());
+ PADDLE_THROW(platform::errors::Unavailable(
+ "Getting 'lod_levels' is not supported by the %s type variable.",
+ this->Name()));
}
}
const proto::VarType::TensorDesc &VarDesc::tensor_desc() const {
- PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set.");
- PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
+ PADDLE_ENFORCE_EQ(
+ desc_.has_type(), true,
+ platform::errors::NotFound("The variable's type was not be set."));
+ PADDLE_ENFORCE_EQ(
+ desc_.type().has_type(), true,
+ platform::errors::NotFound("The variable's type was not be set."));
switch (desc_.type().type()) {
case proto::VarType::SELECTED_ROWS:
return desc_.type().selected_rows();
@@ -204,14 +210,16 @@ const proto::VarType::TensorDesc &VarDesc::tensor_desc() const {
case proto::VarType::LOD_TENSOR_ARRAY:
return desc_.type().tensor_array().tensor();
default:
- PADDLE_THROW(
- "Getting 'tensor_desc' is not supported by the type of var %s.",
- this->Name());
+ PADDLE_THROW(platform::errors::Unavailable(
+ "Getting 'tensor_desc' is not supported by the %s type variable.",
+ this->Name()));
}
}
std::vector<proto::VarType::TensorDesc> VarDesc::tensor_descs() const {
- PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
+ PADDLE_ENFORCE_EQ(
+ desc_.has_type(), true,
+ platform::errors::NotFound("The variable's type was not be set."));
std::vector<proto::VarType::TensorDesc> res;
res.reserve(GetTensorDescNum());
switch (desc_.type().type()) {
@@ -221,16 +229,19 @@ std::vector<proto::VarType::TensorDesc> VarDesc::tensor_descs() const {
}
return res;
default:
- PADDLE_THROW(
- "Getting 'tensor_descs' is not supported by the type of var "
- "%s.",
- this->Name());
+ PADDLE_THROW(platform::errors::Unavailable(
+ "Getting 'tensor_descs' is not supported by the %s type variable.",
+ this->Name()));
}
}
proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() {
- PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
- PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
+ PADDLE_ENFORCE_EQ(
+ desc_.has_type(), true,
+ platform::errors::NotFound("The variable's type was not be set."));
+ PADDLE_ENFORCE_EQ(
+ desc_.type().has_type(), true,
+ platform::errors::NotFound("The variable's type was not be set."));
switch (desc_.type().type()) {
case proto::VarType::SELECTED_ROWS:
return desc_.mutable_type()->mutable_selected_rows();
@@ -240,15 +251,19 @@ proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() {
return desc_.mutable_type()->mutable_tensor_array()->mutable_tensor();
default:
PADDLE_THROW(
- "Getting 'mutable_tensor_desc' is not supported by the type of var "
- "%s.",
- this->Name());
+ platform::errors::Unavailable("Getting 'mutable_tensor_desc' is not "
+ "supported by the %s type variable.",
+ this->Name()));
}
}
std::vector<proto::VarType::TensorDesc *> VarDesc::mutable_tensor_descs() {
- PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
- PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
+ PADDLE_ENFORCE_EQ(
+ desc_.has_type(), true,
+ platform::errors::NotFound("The variable's type was not be set."));
+ PADDLE_ENFORCE_EQ(
+ desc_.type().has_type(), true,
+ platform::errors::NotFound("The variable's type was not be set."));
std::vector<proto::VarType::TensorDesc *> res;
res.reserve(GetTensorDescNum());
switch (desc_.type().type()) {
@@ -259,10 +274,9 @@ std::vector<proto::VarType::TensorDesc *> VarDesc::mutable_tensor_descs() {
}
return res;
default:
- PADDLE_THROW(
- "Getting 'tensor_descs' is not supported by the type of var "
- "%s.",
- this->Name());
+ PADDLE_THROW(platform::errors::Unavailable(
+ "Getting 'tensor_descs' is not supported by the %s type variable.",
+ this->Name()));
}
}
...
@@ -40,7 +40,8 @@ inline proto::VarType::Type ToVarType(int type) {
case proto::VarType::READER:
return static_cast<proto::VarType::Type>(type);
default:
- PADDLE_THROW("ToVarType:Unsupported type %d", type);
+ PADDLE_THROW(platform::errors::Unavailable(
+ "ToVarType method Unsupported type %d.", type));
}
}
@@ -66,7 +67,8 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
visitor(var.Get<FetchList>());
return;
default:
- PADDLE_THROW("Not supported visit type, %s", ToTypeName(var.Type()));
+ PADDLE_THROW(platform::errors::Unavailable("Not supported visit type %s.",
+ ToTypeName(var.Type())));
}
}
...
@@ -46,12 +46,14 @@ struct VarIdToTypeIndexMapInitializerImpl {
static_assert(!std::is_same<Type, void>::value, "Type cannot be void");
constexpr int kId = VarTypeTrait<Type>::kId;
auto type = std::type_index(typeid(Type));
- PADDLE_ENFORCE(id_to_type->count(kId) == 0,
- "Registered duplicate type id %d for type %s", kId,
- type.name());
- PADDLE_ENFORCE(type_to_id->count(type) == 0,
- "Registered duplicate type_index %s for id %d", type.name(),
- kId);
+ PADDLE_ENFORCE_EQ(
+ id_to_type->count(kId), 0,
+ platform::errors::AlreadyExists(
+ "Registered duplicate type id %d for type %s.", kId, type.name()));
+ PADDLE_ENFORCE_EQ(
+ type_to_id->count(type), 0,
+ platform::errors::AlreadyExists(
+ "Registered duplicate type index %s for id %d.", type.name(), kId));
id_to_type->emplace(kId, type);
type_to_id->emplace(type, kId);
VarIdToTypeIndexMapInitializerImpl<kStart + 1, kEnd,
@@ -79,15 +81,17 @@ struct VarIdToTypeIndexMapHolder {
public:
static const std::type_index &ToTypeIndex(int var_id) {
auto it = Instance().id_to_type_map_.find(var_id);
- PADDLE_ENFORCE(it != Instance().id_to_type_map_.end(),
- "VarId %d is not registered.", var_id);
+ PADDLE_ENFORCE_NE(it, Instance().id_to_type_map_.end(),
+ platform::errors::NotFound(
+ "Variable Id %d is not registered.", var_id));
return it->second;
}
static int ToTypeId(const std::type_index &type) {
auto it = Instance().type_to_id_map_.find(type);
- PADDLE_ENFORCE(it != Instance().type_to_id_map_.end(),
- "VarType %s is not registered.", type.name());
+ PADDLE_ENFORCE_NE(it, Instance().type_to_id_map_.end(),
+ platform::errors::NotFound(
+ "Variable Type %s is not registered.", type.name()));
return it->second;
}
...
@@ -50,11 +50,11 @@ void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
} else if (var_type == proto::VarType::RAW) {
// GetMutable will be called in operator
} else {
- PADDLE_THROW(
+ PADDLE_THROW(platform::errors::Unavailable(
"Variable type %d is not in "
"[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
- "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]",
- var_type);
+ "LOD_RANK_TABLE, PLACE_LIST, READER, RAW].",
+ var_type));
}
}
@@ -76,7 +76,8 @@ void CopyVariable(const Variable &src_var, Variable *dst_var) {
auto *dst_t = tmp_grad_slr->mutable_value();
framework::TensorCopy(src_t, cpu_place, dst_t);
} else {
- PADDLE_THROW("unknown var type to copy");
+ PADDLE_THROW(
+ platform::errors::Unavailable("Unknown variable type to copy."));
}
}
...
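A hedged sketch of the call the rewritten message belongs to: initializing a scope variable with one of the supported VarType values.

paddle::framework::Scope scope;
paddle::framework::Variable* var = scope.Var("x");
paddle::framework::InitializeVariable(
    var, paddle::framework::proto::VarType::LOD_TENSOR);
// An enum value outside the listed set now raises errors::Unavailable.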
@@ -218,6 +218,10 @@ struct Argument {
DECL_ARGUMENT_FIELD(fusion_statis, FusionStatis, fusion_statis_t);
+ // Only used in paddle-lite subgraph.
+ DECL_ARGUMENT_FIELD(cpu_math_library_num_threads, CpuMathLibraryNumThreads,
+ int);
private:
std::unordered_set<std::string> valid_fields_;
};
...
@@ -150,6 +150,8 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("use_xpu", new bool(argument->use_xpu()));
pass->Set("xpu_l3_workspace_size",
new int(argument->xpu_l3_workspace_size()));
+ pass->Set("cpu_math_library_num_threads",
+ new int(argument->cpu_math_library_num_threads()));
}
disable_logs_ = argument->disable_logs();
if (pass_name == "fc_fuse_pass") {
...
@@ -244,6 +244,7 @@ void LiteSubgraphPass::SetUpEngine(
bool enable_int8 = Get<bool>("enable_int8");
bool use_xpu = Get<bool>("use_xpu");
int xpu_l3_workspace_size = Get<int>("xpu_l3_workspace_size");
+ int cpu_math_library_num_threads = Get<int>("cpu_math_library_num_threads");
lite_api::TargetType target_type;
if (use_gpu) {
@@ -263,11 +264,12 @@
// Notice: The ordering here determines the device where the
// input tensor of the Lite engine is located, and then affects
// whether tensor sharing is feasible.
- paddle::lite::Place({target_type, precision_type}),
- paddle::lite::Place({target_type, PRECISION(kInt64)}),
- paddle::lite::Place({target_type, PRECISION(kFloat)}),
- paddle::lite::Place({TARGET(kHost), PRECISION(kFloat)}),
+ paddle::lite_api::Place({target_type, precision_type}),
+ paddle::lite_api::Place({target_type, PRECISION(kInt64)}),
+ paddle::lite_api::Place({target_type, PRECISION(kFloat)}),
+ paddle::lite_api::Place({TARGET(kHost), PRECISION(kFloat)}),
};
+ config.cpu_math_library_num_threads = cpu_math_library_num_threads;
config.xpu_l3_workspace_size = xpu_l3_workspace_size;
if (dump_model) {
lite::StrToBinaryFile("./model.bin", config.model);
...
@@ -53,12 +53,10 @@ if(WITH_TESTING)
inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS paddle_fluid_shared
ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
- set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST")
elseif(WIN32)
inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps}
ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
- set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST")
endif()
endif()
...
@@ -461,6 +461,8 @@ void AnalysisPredictor::PrepareArgument() {
}
if (config_.lite_engine_enabled()) {
+ argument_.SetCpuMathLibraryNumThreads(
+ config_.cpu_math_library_num_threads());
argument_.SetLitePrecisionMode(config_.lite_precision_mode_);
argument_.SetLitePassesFilter(config_.lite_passes_filter_);
argument_.SetLiteOpsFilter(config_.lite_ops_filter_);
...
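The three hunks above thread a cpu_math_library_num_threads value from AnalysisConfig through Argument into the Lite subgraph engine. A hedged user-side sketch; the model path and thread count are placeholders, and calling EnableLiteEngine with its default arguments is an assumption:

paddle::AnalysisConfig config;
config.SetModel("./model_dir");          // placeholder path
config.EnableLiteEngine();               // assumed default arguments
config.SetCpuMathLibraryNumThreads(4);   // value forwarded by PrepareArgument()
auto predictor = paddle::CreatePaddlePredictor(config);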
@@ -21,15 +21,21 @@
namespace paddle {
void ZeroCopyTensor::Reshape(const std::vector<int> &shape) {
- PADDLE_ENFORCE(!name_.empty(),
- "Need to SetName first, so that the corresponding tensor can "
- "be retrieved.");
- PADDLE_ENFORCE(input_or_output_,
- "Can't reshape the output tensor, it is readonly");
- PADDLE_ENFORCE(scope_);
+ PADDLE_ENFORCE_EQ(
+ name_.empty(), false,
+ platform::errors::PreconditionNotMet(
+ "Need to SetName first, so that the corresponding tensor can "
+ "be retrieved."));
+ PADDLE_ENFORCE_EQ(input_or_output_, true,
+ platform::errors::PermissionDenied(
+ "Can't reshape the output tensor, it is readonly"));
+ PADDLE_ENFORCE_NOT_NULL(scope_, platform::errors::PreconditionNotMet(
+ "The scope should not be nullptr."));
auto *scope = static_cast<framework::Scope *>(scope_);
auto *var = scope->FindVar(name_);
- PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
+ PADDLE_ENFORCE_NOT_NULL(
+ var, platform::errors::PreconditionNotMet(
+ "No tensor called [%s] in the runtime scope", name_));
auto *tensor = var->GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim(shape));
}
@@ -45,8 +51,10 @@ T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
EAGER_GET_TENSOR;
PADDLE_ENFORCE_GT(
tensor->numel(), 0,
- "You should call ZeroCopyTensor::Reshape(const std::vector<int> &shape)"
- "function before retrieving mutable_data from input tensor.");
+ platform::errors::PreconditionNotMet(
+ "You should call ZeroCopyTensor::Reshape(const std::vector<int> "
+ "&shape)"
+ "function before retrieving mutable_data from input tensor."));
switch (static_cast<int>(place)) {
case static_cast<int>(PaddlePlace::kCPU): {
return tensor->mutable_data<T>(platform::CPUPlace());
@@ -55,7 +63,8 @@ T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
return tensor->mutable_data<T>(platform::CUDAPlace(device_));
}
default:
- PADDLE_THROW("Unsupported place: %d", static_cast<int>(place));
+ PADDLE_THROW(platform::errors::Unavailable("Unsupported place: %d",
+ static_cast<int>(place)));
break;
}
return nullptr;
@@ -96,10 +105,11 @@ PaddleDType ZeroCopyTensor::type() const {
template <typename T>
void ZeroCopyTensor::copy_from_cpu(const T *data) {
EAGER_GET_TENSOR;
- PADDLE_ENFORCE_GE(
- tensor->numel(), 0,
- "You should call ZeroCopyTensor::Reshape(const std::vector<int> &shape)"
- "function before copying data from cpu.");
+ PADDLE_ENFORCE_GE(tensor->numel(), 0,
+ platform::errors::PreconditionNotMet(
+ "You should call ZeroCopyTensor::Reshape(const "
+ "std::vector<int> &shape)"
+ "function before copying data from cpu."));
size_t ele_size = tensor->numel() * sizeof(T);
if (place_ == PaddlePlace::kCPU) {
@@ -116,7 +126,8 @@ void ZeroCopyTensor::copy_from_cpu(const T *data) {
memory::Copy(gpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
data, ele_size, dev_ctx->stream());
#else
- PADDLE_THROW("Not compiled with CUDA, should not reach here.");
+ PADDLE_THROW(platform::errors::Unavailable(
+ "Not compiled with CUDA, should not reach here."));
#endif
}
}
@@ -141,7 +152,8 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
cudaStreamSynchronize(dev_ctx->stream());
#else
- PADDLE_THROW("Not compile with CUDA, should not reach here.");
+ PADDLE_THROW(platform::errors::Unavailable(
+ "Not compile with CUDA, should not reach here."));
#endif
}
}
@@ -176,20 +188,27 @@ template PD_INFER_DECL uint8_t *ZeroCopyTensor::mutable_data<uint8_t>(
PaddlePlace place);
void *ZeroCopyTensor::FindTensor() const {
- PADDLE_ENFORCE(!name_.empty(),
- "Need to SetName first, so that the corresponding tensor can "
- "be retrieved.");
- PADDLE_ENFORCE(scope_);
+ PADDLE_ENFORCE_EQ(
+ name_.empty(), false,
+ platform::errors::PreconditionNotMet(
+ "Need to SetName first, so that the corresponding tensor can "
+ "be retrieved."));
+ PADDLE_ENFORCE_NOT_NULL(scope_, platform::errors::PreconditionNotMet(
+ "The scope should not be nullptr."));
auto *scope = static_cast<framework::Scope *>(scope_);
auto *var = scope->FindVar(name_);
- PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
+ PADDLE_ENFORCE_NOT_NULL(
+ var, platform::errors::PreconditionNotMet(
+ "No tensor called [%s] in the runtime scope", name_));
auto *tensor = var->GetMutable<framework::LoDTensor>();
return tensor;
}
std::vector<int> ZeroCopyTensor::shape() const {
EAGER_GET_TENSOR;
- PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_);
+ PADDLE_ENFORCE_NOT_NULL(
+ tensor_, platform::errors::PreconditionNotMet(
+ "Not found tensor called %s in the scope", name_));
return framework::vectorize<int>(tensor->dims());
}
...
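A hedged sketch of the calling order the new PreconditionNotMet messages enforce on an input ZeroCopyTensor: set the name via the predictor, Reshape, then copy or fetch mutable data. It assumes an AnalysisPredictor-style `predictor` exposing GetInputTensor and ZeroCopyRun; the shape is a placeholder.

auto names = predictor->GetInputNames();
auto input = predictor->GetInputTensor(names[0]);   // ZeroCopyTensor
std::vector<float> host(1 * 3 * 224 * 224, 0.f);    // placeholder shape
input->Reshape({1, 3, 224, 224});                   // must precede the copy
input->copy_from_cpu(host.data());
predictor->ZeroCopyRun();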
@@ -31,12 +31,30 @@ limitations under the License. */
#include "paddle_analysis_config.h"  // NOLINT
#include "paddle_api.h"              // NOLINT
+ ///
+ /// \file paddle_inference_api.h
+ ///
+ /// \brief Paddle Inference API
+ ///
+ /// \author paddle-infer@baidu.com
+ /// \date 2020-09-01
+ /// \since 2.0.0-beta
+ ///
namespace paddle_infer {
using DataType = paddle::PaddleDType;
using PlaceType = paddle::PaddlePlace;
using PrecisionType = paddle::AnalysisConfig::Precision;
using Config = paddle::AnalysisConfig;
+ ///
+ /// \class Tensor
+ ///
+ /// \brief Represents an n-dimensional array of values.
+ /// The Tensor is used to store the input or output of the network.
+ /// It is obtained through Predictor::GetinputHandle()
+ /// and Predictor::GetOutputHandle() interface.
+ ///
class PD_INFER_DECL Tensor {
public:
// Can only be created by predictor->GetInputHandle(cosnt std::string& name)
@@ -44,60 +62,186 @@ class PD_INFER_DECL Tensor {
Tensor() = delete;
explicit Tensor(std::unique_ptr<paddle::ZeroCopyTensor>&& tensor)
: tensor_(std::move(tensor)) {}
+ ///
+ /// \brief Reset the shape of the tensor.
+ /// Generally it's only used for the input tensor.
+ /// Reshape must be called before calling mutable_data() or CopyFromCpu()
+ /// \param shape The shape to set.
+ ///
void Reshape(const std::vector<int>& shape);
+ ///
+ /// \brief Copy the host memory to tensor data.
+ /// It's usually used to set the input tensor data.
+ /// \param data The pointer of the data, from which the tensor will copy.
+ ///
template <typename T>
void CopyFromCpu(const T* data);
- // should add the place
+ ///
+ /// \brief Get the memory pointer in CPU or GPU with specific data type.
+ /// Please Reshape the tensor first before call this.
+ /// It's usually used to get input data pointer.
+ /// \param place The place of the tensor.
+ /// \return The tensor data buffer pointer.
+ ///
template <typename T>
T* mutable_data(PlaceType place);
+ ///
+ /// \brief Copy the tensor data to the host memory.
+ /// It's usually used to get the output tensor data.
+ /// \param[out] data The tensor will copy the data to the address.
+ ///
template <typename T>
void CopyToCpu(T* data);
+ ///
+ /// \brief Get the memory pointer directly.
+ /// It's usually used to get the output data pointer.
+ /// \param[out] place To get the device type of the tensor.
+ /// \param[out] size To get the data size of the tensor.
+ /// \return The tensor data buffer pointer.
+ ///
template <typename T>
T* data(PlaceType* place, int* size) const;
+ ///
+ /// \brief Set lod info of the tensor.
+ /// More about LOD can be seen here:
+ /// https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/basic_concept/lod_tensor.html#lodtensor
+ /// \param x the lod info.
+ ///
void SetLoD(const std::vector<std::vector<size_t>>& x);
+ /// \brief Return the lod info of the tensor.
std::vector<std::vector<size_t>> lod() const;
+ /// \brief Return the data type of the tensor.
+ /// It's usually used to get the output tensor data type.
+ /// \return The data type of the tensor.
DataType type() const;
+ /// \brief Return the shape of the Tensor.
std::vector<int> shape() const;
+ /// \brief Return the name of the tensor.
const std::string& name() const;
private:
std::unique_ptr<paddle::ZeroCopyTensor> tensor_;
};
+ ///
+ /// \class Predictor
+ ///
+ /// \brief Predictor is the interface for model prediction.
+ ///
+ /// The predictor has the following typical uses:
+ ///
+ /// Get predictor
+ /// \code{cpp}
+ /// auto predictor = CreatePredictor(config);
+ /// \endcode
+ ///
+ /// Get input or output names
+ /// \code{cpp}
+ /// auto input_names = predictor->GetInputNames();
+ /// auto output_names = predictor->GetOutputNames();
+ /// \endcode
+ ///
+ /// Get input or output handle
+ /// \code{cpp}
+ /// auto input_t = predictor->GetInputHandle(input_names[0]);
+ /// auto output_t = predictor->GetOutputHandle(output_names[0]);
+ /// \endcode
+ ///
+ /// Run predictor
+ /// \code{cpp}
+ /// predictor->Run();
+ /// \endcode
+ ///
class PD_INFER_DECL Predictor {
public:
- Predictor() = default;
+ Predictor() = delete;
~Predictor() {}
// Use for clone
explicit Predictor(std::unique_ptr<paddle::PaddlePredictor>&& pred)
: predictor_(std::move(pred)) {}
+ ///
+ /// \brief Construct a new Predictor object
+ ///
+ /// \param[in] Config config
+ ///
explicit Predictor(const Config& config);
+ ///
+ /// \brief Get the input names
+ ///
+ /// \return input names
+ ///
std::vector<std::string> GetInputNames();
+ ///
+ /// \brief Get the Input Tensor object
+ ///
+ /// \param[in] name input name
+ /// \return input tensor
+ ///
std::unique_ptr<Tensor> GetInputHandle(const std::string& name);
+ ///
+ /// \brief Run the prediction engine
+ ///
+ /// \return Whether the function executed successfully
+ ///
bool Run();
+ ///
+ /// \brief Get the output names
+ ///
+ /// \return output names
+ ///
std::vector<std::string> GetOutputNames();
+ ///
+ /// \brief Get the Output Tensor object
+ ///
+ /// \param[in] name otuput name
+ /// \return output tensor
+ ///
std::unique_ptr<Tensor> GetOutputHandle(const std::string& name);
+ ///
+ /// \brief Clone to get the new predictor. thread safe.
+ ///
+ /// \return get a new predictor
+ ///
std::unique_ptr<Predictor> Clone();
+ /// \brief Clear the intermediate tensors of the predictor
void ClearIntermediateTensor();
private:
std::unique_ptr<paddle::PaddlePredictor> predictor_;
};
+ ///
+ /// \brief A factory to help create predictors.
+ ///
+ /// Usage:
+ ///
+ /// \code{.cpp}
+ /// Config config;
+ /// ... // change the configs.
+ /// auto predictor = CreatePredictor(config);
+ /// \endcode
+ ///
PD_INFER_DECL std::shared_ptr<Predictor> CreatePredictor(
const Config& config);  // NOLINT
PD_INFER_DECL int GetNumBytesOfDataType(DataType dtype);
PD_INFER_DECL std::string GetVersion();
@@ -128,13 +272,24 @@ T* Tensor::data(PlaceType* place, int* size) const {
namespace paddle_infer {
namespace services {
+ ///
+ /// \class PredictorPool
+ ///
+ /// \brief PredictorPool is a simple encapsulation of Predictor, suitable for
+ /// use in multi-threaded situations. According to the thread id, the
+ /// corresponding Predictor is taken out from PredictorPool to complete the
+ /// prediction.
+ ///
class PD_INFER_DECL PredictorPool {
public:
PredictorPool() = delete;
PredictorPool(const PredictorPool&) = delete;
PredictorPool& operator=(const PredictorPool&) = delete;
+ /// \brief Construct the predictor pool with \param size predictor instances.
explicit PredictorPool(const Config& config, size_t size = 1);
+ /// \brief Get \param id-th predictor.
Predictor* Retrive(size_t idx);
private:
...
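An end-to-end sketch assembled from the documented calls in the header above; the model path and shapes are placeholders, not values from the commit.

#include "paddle_inference_api.h"
#include <functional>
#include <numeric>
#include <vector>

int main() {
  paddle_infer::Config config;
  config.SetModel("./mobilenet");              // placeholder model directory
  auto predictor = paddle_infer::CreatePredictor(config);

  auto input_names = predictor->GetInputNames();
  auto input_t = predictor->GetInputHandle(input_names[0]);
  std::vector<float> input(1 * 3 * 224 * 224, 1.f);
  input_t->Reshape({1, 3, 224, 224});
  input_t->CopyFromCpu(input.data());

  predictor->Run();

  auto output_names = predictor->GetOutputNames();
  auto output_t = predictor->GetOutputHandle(output_names[0]);
  auto out_shape = output_t->shape();
  int out_num = std::accumulate(out_shape.begin(), out_shape.end(), 1,
                                std::multiplies<int>());
  std::vector<float> out_data(out_num);
  output_t->CopyToCpu(out_data.data());
  return 0;
}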
@@ -16,6 +16,7 @@
#include <vector>
#include "paddle/fluid/inference/capi/c_api_internal.h"
#include "paddle/fluid/inference/capi/paddle_c_api.h"
+ #include "paddle/fluid/platform/enforce.h"
using paddle::ConvertToACPrecision;
using paddle::ConvertToPaddleDType;
@@ -34,27 +35,37 @@ void PD_DeletePaddleBuf(PD_PaddleBuf* buf) {
}
void PD_PaddleBufResize(PD_PaddleBuf* buf, size_t length) {
- PADDLE_ENFORCE_NOT_NULL(buf);
+ PADDLE_ENFORCE_NOT_NULL(buf,
+ paddle::platform::errors::InvalidArgument(
+ "The pointer of Buffer shouldn't be nullptr"));
buf->buf.Resize(length);
}
void PD_PaddleBufReset(PD_PaddleBuf* buf, void* data, size_t length) {
- PADDLE_ENFORCE_NOT_NULL(buf);
+ PADDLE_ENFORCE_NOT_NULL(buf,
+ paddle::platform::errors::InvalidArgument(
+ "The pointer of Buffer shouldn't be nullptr"));
buf->buf.Reset(data, length);
}
bool PD_PaddleBufEmpty(PD_PaddleBuf* buf) {
- PADDLE_ENFORCE_NOT_NULL(buf);
+ PADDLE_ENFORCE_NOT_NULL(buf,
+ paddle::platform::errors::InvalidArgument(
+ "The pointer of Buffer shouldn't be nullptr"));
return buf->buf.empty();
}
void* PD_PaddleBufData(PD_PaddleBuf* buf) {
- PADDLE_ENFORCE_NOT_NULL(buf);
+ PADDLE_ENFORCE_NOT_NULL(buf,
+ paddle::platform::errors::InvalidArgument(
+ "The pointer of Buffer shouldn't be nullptr"));
return buf->buf.data();
}
size_t PD_PaddleBufLength(PD_PaddleBuf* buf) {
- PADDLE_ENFORCE_NOT_NULL(buf);
+ PADDLE_ENFORCE_NOT_NULL(buf,
+ paddle::platform::errors::InvalidArgument(
+ "The pointer of Buffer shouldn't be nullptr"));
return buf->buf.length();
}
...
@@ -18,7 +18,6 @@
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/api/paddle_api.h"
#include "paddle/fluid/inference/capi/paddle_c_api.h"
- #include "paddle/fluid/platform/enforce.h"
using PD_PaddleDType = paddle::PaddleDType;
using PD_ACPrecision = paddle::AnalysisConfig::Precision;
...
@@ -20,6 +20,7 @@
#include <vector>
#include "paddle/fluid/inference/capi/c_api_internal.h"
#include "paddle/fluid/inference/capi/paddle_c_api.h"
+ #include "paddle/fluid/platform/enforce.h"
using paddle::ConvertToACPrecision;
using paddle::ConvertToPaddleDType;
@@ -40,7 +41,10 @@ void PD_DeleteAnalysisConfig(PD_AnalysisConfig* config) {
void PD_SetModel(PD_AnalysisConfig* config, const char* model_dir,
const char* params_path) {
LOG(INFO) << model_dir;
- PADDLE_ENFORCE_NOT_NULL(config);
+ PADDLE_ENFORCE_NOT_NULL(
+ config,
+ paddle::platform::errors::InvalidArgument(
+ "The pointer of analysis configuration shouldn't be nullptr"));
LOG(INFO) << std::string(model_dir);
if (!params_path) {
config->config.SetModel(std::string(model_dir));
@@ -50,104 +54,164 @@ void PD_SetModel(PD_AnalysisConfig* config, const char* model_dir,
}
void PD_SetProgFile(PD_AnalysisConfig* config, const char* x) {
- PADDLE_ENFORCE_NOT_NULL(config);
+ PADDLE_ENFORCE_NOT_NULL(
+ config,
+ paddle::platform::errors::InvalidArgument(
+ "The pointer of analysis configuration shouldn't be nullptr"));
config->config.SetProgFile(std::string(x));
}
void PD_SetParamsFile(PD_AnalysisConfig* config, const char* x) {
- PADDLE_ENFORCE_NOT_NULL(config);
+ PADDLE_ENFORCE_NOT_NULL(
+ config,
+ paddle::platform::errors::InvalidArgument(
+ "The pointer of analysis configuration shouldn't be nullptr"));
config->config.SetParamsFile(std::string(x));
}
void PD_SetOptimCacheDir(PD_AnalysisConfig* config, const char* opt_cache_dir) {
- PADDLE_ENFORCE_NOT_NULL(config);
+ PADDLE_ENFORCE_NOT_NULL(
+ config,
+ paddle::platform::errors::InvalidArgument(
+ "The pointer of analysis configuration shouldn't be nullptr"));
config->config.SetOptimCacheDir(std::string(opt_cache_dir));
}
const char* PD_ModelDir(const PD_AnalysisConfig* config) {
- PADDLE_ENFORCE_NOT_NULL(config);
+ PADDLE_ENFORCE_NOT_NULL(
+ config,
+ paddle::platform::errors::InvalidArgument(
+ "The pointer of analysis configuration shouldn't be nullptr"));
return config->config.model_dir().c_str();
}
const char* PD_ProgFile(const PD_AnalysisConfig* config) {
- PADDLE_ENFORCE_NOT_NULL(config);
+ PADDLE_ENFORCE_NOT_NULL(
+ config,
+ paddle::platform::errors::InvalidArgument(
+ "The pointer of analysis configuration shouldn't be nullptr"));
return config->config.prog_file().c_str();
}
const char* PD_ParamsFile(const PD_AnalysisConfig* config) {
- PADDLE_ENFORCE_NOT_NULL(config);
+ PADDLE_ENFORCE_NOT_NULL(
+ config,
+ paddle::platform::errors::InvalidArgument(
+ "The pointer of analysis configuration shouldn't be nullptr"));
return config->config.params_file().c_str();
}
void PD_EnableUseGpu(PD_AnalysisConfig* config, int memory_pool_init_size_mb,
int device_id) {
- PADDLE_ENFORCE_NOT_NULL(config);
+ PADDLE_ENFORCE_NOT_NULL(
+ config,
+ paddle::platform::errors::InvalidArgument(
+ "The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableUseGpu(static_cast<uint64_t>(memory_pool_init_size_mb),
device_id); device_id);
} }
void PD_DisableGpu(PD_AnalysisConfig* config) { void PD_DisableGpu(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.DisableGpu(); config->config.DisableGpu();
} }
bool PD_UseGpu(const PD_AnalysisConfig* config) { bool PD_UseGpu(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.use_gpu(); return config->config.use_gpu();
} }
int PD_GpuDeviceId(const PD_AnalysisConfig* config) { int PD_GpuDeviceId(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.gpu_device_id(); return config->config.gpu_device_id();
} }
int PD_MemoryPoolInitSizeMb(const PD_AnalysisConfig* config) { int PD_MemoryPoolInitSizeMb(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.memory_pool_init_size_mb(); return config->config.memory_pool_init_size_mb();
} }
float PD_FractionOfGpuMemoryForPool(const PD_AnalysisConfig* config) { float PD_FractionOfGpuMemoryForPool(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.fraction_of_gpu_memory_for_pool(); return config->config.fraction_of_gpu_memory_for_pool();
} }
void PD_EnableCUDNN(PD_AnalysisConfig* config) { void PD_EnableCUDNN(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableCUDNN(); config->config.EnableCUDNN();
} }
bool PD_CudnnEnabled(const PD_AnalysisConfig* config) { bool PD_CudnnEnabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.cudnn_enabled(); return config->config.cudnn_enabled();
} }
void PD_SwitchIrOptim(PD_AnalysisConfig* config, bool x) { void PD_SwitchIrOptim(PD_AnalysisConfig* config, bool x) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SwitchIrOptim(x); config->config.SwitchIrOptim(x);
} }
bool PD_IrOptim(const PD_AnalysisConfig* config) { bool PD_IrOptim(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.ir_optim(); return config->config.ir_optim();
} }
void PD_SwitchUseFeedFetchOps(PD_AnalysisConfig* config, bool x) { void PD_SwitchUseFeedFetchOps(PD_AnalysisConfig* config, bool x) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SwitchUseFeedFetchOps(x); config->config.SwitchUseFeedFetchOps(x);
} }
bool PD_UseFeedFetchOpsEnabled(const PD_AnalysisConfig* config) { bool PD_UseFeedFetchOpsEnabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.use_feed_fetch_ops_enabled(); return config->config.use_feed_fetch_ops_enabled();
} }
void PD_SwitchSpecifyInputNames(PD_AnalysisConfig* config, bool x) { void PD_SwitchSpecifyInputNames(PD_AnalysisConfig* config, bool x) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SwitchSpecifyInputNames(x); config->config.SwitchSpecifyInputNames(x);
} }
bool PD_SpecifyInputName(const PD_AnalysisConfig* config) { bool PD_SpecifyInputName(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.specify_input_name(); return config->config.specify_input_name();
} }
...@@ -155,110 +219,168 @@ void PD_EnableTensorRtEngine(PD_AnalysisConfig* config, int workspace_size, ...@@ -155,110 +219,168 @@ void PD_EnableTensorRtEngine(PD_AnalysisConfig* config, int workspace_size,
int max_batch_size, int min_subgraph_size, int max_batch_size, int min_subgraph_size,
Precision precision, bool use_static, Precision precision, bool use_static,
bool use_calib_mode) { bool use_calib_mode) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableTensorRtEngine( config->config.EnableTensorRtEngine(
workspace_size, max_batch_size, min_subgraph_size, workspace_size, max_batch_size, min_subgraph_size,
paddle::ConvertToACPrecision(precision), use_static, use_calib_mode); paddle::ConvertToACPrecision(precision), use_static, use_calib_mode);
} }
bool PD_TensorrtEngineEnabled(const PD_AnalysisConfig* config) { bool PD_TensorrtEngineEnabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.tensorrt_engine_enabled(); return config->config.tensorrt_engine_enabled();
} }
void PD_SwitchIrDebug(PD_AnalysisConfig* config, bool x) { void PD_SwitchIrDebug(PD_AnalysisConfig* config, bool x) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SwitchIrDebug(x); config->config.SwitchIrDebug(x);
} }
void PD_EnableMKLDNN(PD_AnalysisConfig* config) { void PD_EnableMKLDNN(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableMKLDNN(); config->config.EnableMKLDNN();
} }
void PD_SetMkldnnCacheCapacity(PD_AnalysisConfig* config, int capacity) { void PD_SetMkldnnCacheCapacity(PD_AnalysisConfig* config, int capacity) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SetMkldnnCacheCapacity(capacity); config->config.SetMkldnnCacheCapacity(capacity);
} }
bool PD_MkldnnEnabled(const PD_AnalysisConfig* config) { bool PD_MkldnnEnabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.mkldnn_enabled(); return config->config.mkldnn_enabled();
} }
void PD_SetCpuMathLibraryNumThreads(PD_AnalysisConfig* config, void PD_SetCpuMathLibraryNumThreads(PD_AnalysisConfig* config,
int cpu_math_library_num_threads) { int cpu_math_library_num_threads) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SetCpuMathLibraryNumThreads(cpu_math_library_num_threads); config->config.SetCpuMathLibraryNumThreads(cpu_math_library_num_threads);
} }
int PD_CpuMathLibraryNumThreads(const PD_AnalysisConfig* config) { int PD_CpuMathLibraryNumThreads(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.cpu_math_library_num_threads(); return config->config.cpu_math_library_num_threads();
} }
void PD_EnableMkldnnQuantizer(PD_AnalysisConfig* config) { void PD_EnableMkldnnQuantizer(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableMkldnnQuantizer(); config->config.EnableMkldnnQuantizer();
} }
bool PD_MkldnnQuantizerEnabled(const PD_AnalysisConfig* config) { bool PD_MkldnnQuantizerEnabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.mkldnn_quantizer_enabled(); return config->config.mkldnn_quantizer_enabled();
} }
void PD_EnableMkldnnBfloat16(PD_AnalysisConfig* config) { void PD_EnableMkldnnBfloat16(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound( PADDLE_ENFORCE_NOT_NULL(
"PD_AnalysisConfig should not be null")); config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableMkldnnBfloat16(); config->config.EnableMkldnnBfloat16();
} }
bool PD_MkldnnBfloat16Enabled(const PD_AnalysisConfig* config) { bool PD_MkldnnBfloat16Enabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound( PADDLE_ENFORCE_NOT_NULL(
"PD_AnalysisConfig should not be null")); config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.mkldnn_bfloat16_enabled(); return config->config.mkldnn_bfloat16_enabled();
} }
void PD_SetModelBuffer(PD_AnalysisConfig* config, const char* prog_buffer, void PD_SetModelBuffer(PD_AnalysisConfig* config, const char* prog_buffer,
size_t prog_buffer_size, const char* params_buffer, size_t prog_buffer_size, const char* params_buffer,
size_t params_buffer_size) { size_t params_buffer_size) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SetModelBuffer(prog_buffer, prog_buffer_size, params_buffer, config->config.SetModelBuffer(prog_buffer, prog_buffer_size, params_buffer,
params_buffer_size); params_buffer_size);
} }
bool PD_ModelFromMemory(const PD_AnalysisConfig* config) { bool PD_ModelFromMemory(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.model_from_memory(); return config->config.model_from_memory();
} }
void PD_EnableMemoryOptim(PD_AnalysisConfig* config) { void PD_EnableMemoryOptim(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableMemoryOptim(); config->config.EnableMemoryOptim();
} }
bool PD_MemoryOptimEnabled(const PD_AnalysisConfig* config) { bool PD_MemoryOptimEnabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.enable_memory_optim(); return config->config.enable_memory_optim();
} }
void PD_EnableProfile(PD_AnalysisConfig* config) { void PD_EnableProfile(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableProfile(); config->config.EnableProfile();
} }
bool PD_ProfileEnabled(const PD_AnalysisConfig* config) { bool PD_ProfileEnabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.profile_enabled(); return config->config.profile_enabled();
} }
void PD_SetInValid(PD_AnalysisConfig* config) { void PD_SetInValid(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SetInValid(); config->config.SetInValid();
} }
bool PD_IsValid(const PD_AnalysisConfig* config) { bool PD_IsValid(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.is_valid(); return config->config.is_valid();
} }
......
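Editor's note: every PD_AnalysisConfig accessor above now rejects a null config with an InvalidArgument message. A minimal sketch of a CPU-only configuration built through this C API follows; PD_NewAnalysisConfig is assumed to be the usual constructor of this API, and the model directory is a placeholder.

#include "paddle/fluid/inference/capi/paddle_c_api.h"

int main() {
  PD_AnalysisConfig* config = PD_NewAnalysisConfig();  // assumed constructor
  PD_SetModel(config, "./mobilenet_v1", nullptr);      // placeholder model dir
  PD_DisableGpu(config);
  PD_SwitchIrOptim(config, true);
  PD_EnableMKLDNN(config);
  PD_SetCpuMathLibraryNumThreads(config, 4);
  // Any of the calls above would now raise InvalidArgument if config were null.
  PD_DeleteAnalysisConfig(config);
  return 0;
}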
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "paddle/fluid/inference/api/paddle_api.h" #include "paddle/fluid/inference/api/paddle_api.h"
#include "paddle/fluid/inference/capi/c_api_internal.h" #include "paddle/fluid/inference/capi/c_api_internal.h"
#include "paddle/fluid/inference/capi/paddle_c_api.h" #include "paddle/fluid/inference/capi/paddle_c_api.h"
#include "paddle/fluid/platform/enforce.h"
using paddle::ConvertToACPrecision; using paddle::ConvertToACPrecision;
using paddle::ConvertToPaddleDType; using paddle::ConvertToPaddleDType;
...@@ -81,7 +82,10 @@ extern "C" { ...@@ -81,7 +82,10 @@ extern "C" {
bool PD_PredictorRun(const PD_AnalysisConfig* config, PD_Tensor* inputs, bool PD_PredictorRun(const PD_AnalysisConfig* config, PD_Tensor* inputs,
int in_size, PD_Tensor** output_data, int* out_size, int in_size, PD_Tensor** output_data, int* out_size,
int batch_size) { int batch_size) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
VLOG(3) << "Predoctor: PD_PredictorRun. "; VLOG(3) << "Predoctor: PD_PredictorRun. ";
static std::map<std::string, std::unique_ptr<paddle::PaddlePredictor>> static std::map<std::string, std::unique_ptr<paddle::PaddlePredictor>>
predictors; predictors;
...@@ -111,7 +115,10 @@ bool PD_PredictorRun(const PD_AnalysisConfig* config, PD_Tensor* inputs, ...@@ -111,7 +115,10 @@ bool PD_PredictorRun(const PD_AnalysisConfig* config, PD_Tensor* inputs,
bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config, bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config,
PD_ZeroCopyData* inputs, int in_size, PD_ZeroCopyData* inputs, int in_size,
PD_ZeroCopyData** output, int* out_size) { PD_ZeroCopyData** output, int* out_size) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
static std::map<std::string, std::unique_ptr<paddle::PaddlePredictor>> static std::map<std::string, std::unique_ptr<paddle::PaddlePredictor>>
predictors; predictors;
if (!predictors.count(config->config.model_dir())) { if (!predictors.count(config->config.model_dir())) {
...@@ -144,7 +151,8 @@ bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config, ...@@ -144,7 +151,8 @@ bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config,
input_t->copy_from_cpu(static_cast<uint8_t*>(inputs[i].data)); input_t->copy_from_cpu(static_cast<uint8_t*>(inputs[i].data));
break; break;
default: default:
CHECK(false) << "Unsupport data type."; PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"Unsupported data type."));
break; break;
} }
} }
...@@ -227,7 +235,8 @@ void PD_SetZeroCopyInput(PD_Predictor* predictor, ...@@ -227,7 +235,8 @@ void PD_SetZeroCopyInput(PD_Predictor* predictor,
input->copy_from_cpu(static_cast<uint8_t*>(tensor->data.data)); input->copy_from_cpu(static_cast<uint8_t*>(tensor->data.data));
break; break;
default: default:
CHECK(false) << "Unsupport data type."; PADDLE_THROW(
paddle::platform::errors::InvalidArgument("Unsupported data type."));
break; break;
} }
...@@ -294,7 +303,8 @@ void PD_GetZeroCopyOutput(PD_Predictor* predictor, PD_ZeroCopyTensor* tensor) { ...@@ -294,7 +303,8 @@ void PD_GetZeroCopyOutput(PD_Predictor* predictor, PD_ZeroCopyTensor* tensor) {
output->copy_to_cpu(reinterpret_cast<uint8_t*>(tensor->data.data)); output->copy_to_cpu(reinterpret_cast<uint8_t*>(tensor->data.data));
break; break;
default: default:
CHECK(false) << "Unsupport data type."; PADDLE_THROW(
paddle::platform::errors::InvalidArgument("Unsupported data type."));
break; break;
} }
} }
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include <vector> #include <vector>
#include "paddle/fluid/inference/capi/c_api_internal.h" #include "paddle/fluid/inference/capi/c_api_internal.h"
#include "paddle/fluid/inference/capi/paddle_c_api.h" #include "paddle/fluid/inference/capi/paddle_c_api.h"
#include "paddle/fluid/platform/enforce.h"
using paddle::ConvertToACPrecision; using paddle::ConvertToACPrecision;
using paddle::ConvertToPaddleDType; using paddle::ConvertToPaddleDType;
...@@ -37,44 +38,60 @@ void PD_DeletePaddleTensor(PD_Tensor* tensor) { ...@@ -37,44 +38,60 @@ void PD_DeletePaddleTensor(PD_Tensor* tensor) {
} }
void PD_SetPaddleTensorName(PD_Tensor* tensor, char* name) { void PD_SetPaddleTensorName(PD_Tensor* tensor, char* name) {
PADDLE_ENFORCE_NOT_NULL(tensor); PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
tensor->tensor.name = std::string(name); tensor->tensor.name = std::string(name);
} }
void PD_SetPaddleTensorDType(PD_Tensor* tensor, PD_DataType dtype) { void PD_SetPaddleTensorDType(PD_Tensor* tensor, PD_DataType dtype) {
PADDLE_ENFORCE_NOT_NULL(tensor); PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
tensor->tensor.dtype = paddle::ConvertToPaddleDType(dtype); tensor->tensor.dtype = paddle::ConvertToPaddleDType(dtype);
} }
void PD_SetPaddleTensorData(PD_Tensor* tensor, PD_PaddleBuf* buf) { void PD_SetPaddleTensorData(PD_Tensor* tensor, PD_PaddleBuf* buf) {
PADDLE_ENFORCE_NOT_NULL(tensor); PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
tensor->tensor.data = buf->buf; tensor->tensor.data = buf->buf;
} }
void PD_SetPaddleTensorShape(PD_Tensor* tensor, int* shape, int size) { void PD_SetPaddleTensorShape(PD_Tensor* tensor, int* shape, int size) {
PADDLE_ENFORCE_NOT_NULL(tensor); PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
tensor->tensor.shape.assign(shape, shape + size); tensor->tensor.shape.assign(shape, shape + size);
} }
const char* PD_GetPaddleTensorName(const PD_Tensor* tensor) { const char* PD_GetPaddleTensorName(const PD_Tensor* tensor) {
PADDLE_ENFORCE_NOT_NULL(tensor); PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
return tensor->tensor.name.c_str(); return tensor->tensor.name.c_str();
} }
PD_DataType PD_GetPaddleTensorDType(const PD_Tensor* tensor) { PD_DataType PD_GetPaddleTensorDType(const PD_Tensor* tensor) {
PADDLE_ENFORCE_NOT_NULL(tensor); PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
return ConvertToPDDataType(tensor->tensor.dtype); return ConvertToPDDataType(tensor->tensor.dtype);
} }
PD_PaddleBuf* PD_GetPaddleTensorData(const PD_Tensor* tensor) { PD_PaddleBuf* PD_GetPaddleTensorData(const PD_Tensor* tensor) {
PADDLE_ENFORCE_NOT_NULL(tensor); PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
PD_PaddleBuf* ret = PD_NewPaddleBuf(); PD_PaddleBuf* ret = PD_NewPaddleBuf();
ret->buf = tensor->tensor.data; ret->buf = tensor->tensor.data;
return ret; return ret;
} }
const int* PD_GetPaddleTensorShape(const PD_Tensor* tensor, int* size) { const int* PD_GetPaddleTensorShape(const PD_Tensor* tensor, int* size) {
PADDLE_ENFORCE_NOT_NULL(tensor); PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
const std::vector<int>& shape = tensor->tensor.shape; const std::vector<int>& shape = tensor->tensor.shape;
*size = shape.size(); *size = shape.size();
return shape.data(); return shape.data();
......
...@@ -20,8 +20,12 @@ ...@@ -20,8 +20,12 @@
#define LITE_WITH_XPU 1 #define LITE_WITH_XPU 1
#endif #endif
#ifndef PADDLE_WITH_ARM
#define LITE_WITH_X86 1
#endif
#include "paddle/fluid/inference/lite/engine.h" #include "paddle/fluid/inference/lite/engine.h"
#include "lite/api/paddle_use_passes.h" #include <utility>
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -36,32 +40,40 @@ bool EngineManager::Has(const std::string& name) const { ...@@ -36,32 +40,40 @@ bool EngineManager::Has(const std::string& name) const {
return engines_.at(name).get() != nullptr; return engines_.at(name).get() != nullptr;
} }
paddle::lite::Predictor* EngineManager::Get(const std::string& name) const { paddle::lite_api::PaddlePredictor* EngineManager::Get(
const std::string& name) const {
return engines_.at(name).get(); return engines_.at(name).get();
} }
paddle::lite::Predictor* EngineManager::Create(const std::string& name, paddle::lite_api::PaddlePredictor* EngineManager::Create(
const EngineConfig& cfg) { const std::string& name, const EngineConfig& cfg) {
if (cfg.valid_places.front().target == TARGET(kCUDA)) { // config info for predictor.
#ifdef PADDLE_WITH_CUDA paddle::lite_api::CxxConfig lite_cxx_config;
paddle::lite::Env<TARGET(kCUDA)>::Init(); lite_cxx_config.set_model_buffer(cfg.model.c_str(), cfg.model.size(),
cfg.param.c_str(), cfg.param.size());
lite_cxx_config.set_valid_places(cfg.valid_places);
#ifdef PADDLE_WITH_ARM
lite_cxx_config.set_threads(cfg.cpu_math_library_num_threads);
#else
lite_cxx_config.set_x86_math_library_num_threads(
cfg.cpu_math_library_num_threads);
#endif #endif
} else if (cfg.valid_places.front().target == TARGET(kXPU)) {
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
paddle::lite::TargetWrapper<TARGET(kXPU)>::workspace_l3_size_per_thread = lite_cxx_config.set_xpu_workspace_l3_size_per_thread(
cfg.xpu_l3_workspace_size; cfg.xpu_l3_workspace_size);
#endif #endif
}
auto* p = new paddle::lite::Predictor(); // create predictor
p->Build("", cfg.model, cfg.param, cfg.valid_places, cfg.neglected_passes, std::shared_ptr<paddle::lite_api::PaddlePredictor> p =
cfg.model_type, cfg.model_from_memory); paddle::lite_api::CreatePaddlePredictor(lite_cxx_config);
engines_[name].reset(p); engines_[name] = std::move(p);
return p; return engines_[name].get();
} }
void EngineManager::DeleteAll() { void EngineManager::DeleteAll() {
for (auto& item : engines_) { for (auto& item : engines_) {
item.second.reset(nullptr); item.second.reset();
} }
} }
......
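Editor's note: the rewritten EngineManager::Create goes through the public lite_api (CxxConfig plus CreatePaddlePredictor) instead of the internal lite::Predictor. A standalone sketch of that call sequence follows, assuming the Paddle-Lite library is linked; model_buf and param_buf stand in for the in-memory model that cfg.model and cfg.param carry above.

#include <memory>
#include <string>
#include <vector>

#include "lite/api/paddle_api.h"
#include "lite/api/paddle_place.h"

std::shared_ptr<paddle::lite_api::PaddlePredictor> BuildLitePredictor(
    const std::string& model_buf, const std::string& param_buf) {
  paddle::lite_api::CxxConfig config;
  config.set_model_buffer(model_buf.c_str(), model_buf.size(),
                          param_buf.c_str(), param_buf.size());
  config.set_valid_places({
      paddle::lite_api::Place({TARGET(kX86), PRECISION(kFloat)}),
      paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}),
  });
  // Mirrors cfg.cpu_math_library_num_threads on the x86 path above.
  config.set_x86_math_library_num_threads(1);
  return paddle::lite_api::CreatePaddlePredictor(config);
}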
...@@ -23,12 +23,9 @@ ...@@ -23,12 +23,9 @@
#pragma GCC diagnostic push #pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wall" #pragma GCC diagnostic ignored "-Wall"
#include "lite/api/cxx_api.h" #include "lite/api/cxx_api.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_place.h" #include "lite/api/paddle_place.h"
#include "lite/core/context.h" #include "lite/api/paddle_use_passes.h"
#include "lite/core/device_info.h"
#include "lite/core/memory.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#pragma GCC diagnostic pop #pragma GCC diagnostic pop
namespace paddle { namespace paddle {
...@@ -38,25 +35,33 @@ namespace lite { ...@@ -38,25 +35,33 @@ namespace lite {
struct EngineConfig { struct EngineConfig {
std::string model; std::string model;
std::string param; std::string param;
paddle::lite::Place prefer_place; std::vector<paddle::lite_api::Place> valid_places;
std::vector<paddle::lite::Place> valid_places;
std::vector<std::string> neglected_passes; std::vector<std::string> neglected_passes;
lite_api::LiteModelType model_type{lite_api::LiteModelType::kProtobuf}; lite_api::LiteModelType model_type{lite_api::LiteModelType::kProtobuf};
bool model_from_memory{true}; bool model_from_memory{true};
// for xpu
size_t xpu_l3_workspace_size; size_t xpu_l3_workspace_size;
// for x86 or arm
int cpu_math_library_num_threads{1};
// for cuda
bool use_multi_stream{false};
}; };
class EngineManager { class EngineManager {
public: public:
bool Empty() const; bool Empty() const;
bool Has(const std::string& name) const; bool Has(const std::string& name) const;
paddle::lite::Predictor* Get(const std::string& name) const; paddle::lite_api::PaddlePredictor* Get(const std::string& name) const;
paddle::lite::Predictor* Create(const std::string& name, paddle::lite_api::PaddlePredictor* Create(const std::string& name,
const EngineConfig& cfg); const EngineConfig& cfg);
void DeleteAll(); void DeleteAll();
private: private:
std::unordered_map<std::string, std::unique_ptr<paddle::lite::Predictor>> std::unordered_map<std::string,
std::shared_ptr<paddle::lite_api::PaddlePredictor>>
engines_; engines_;
}; };
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/lite/tensor_utils.h" #include "paddle/fluid/inference/lite/tensor_utils.h"
#include <functional>
#include <map> #include <map>
#include <memory> #include <memory>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
...@@ -144,16 +145,55 @@ void MemoryCopyAsync(const platform::Place& dst_place, void* dst_data, ...@@ -144,16 +145,55 @@ void MemoryCopyAsync(const platform::Place& dst_place, void* dst_data,
} }
} }
void InitDstTensor(paddle::lite::Tensor* dst, const framework::LoDTensor& src) { void* GetLiteTensorDataPtr(paddle::lite_api::Tensor* src,
PrecisionType precision_type,
TargetType target_type) {
void* res{nullptr};
switch (precision_type) {
case PrecisionType::kFloat:
res = static_cast<void*>(src->mutable_data<float>(target_type));
break;
case PrecisionType::kInt8:
res = static_cast<void*>(src->mutable_data<int8_t>(target_type));
break;
case PrecisionType::kInt32:
res = static_cast<void*>(src->mutable_data<int32_t>(target_type));
break;
case PrecisionType::kInt64:
res = static_cast<void*>(src->mutable_data<int64_t>(target_type));
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported precision type. Now only supports FP32, INT8, INT32 and "
"INT64."));
break;
}
return res;
}
int64_t GetLiteTensorNumel(const paddle::lite_api::Tensor& tensor) {
auto shape = tensor.shape();
int64_t numel = std::accumulate(shape.begin(), shape.end(), 1,
std::multiplies<int64_t>());
return numel;
}
void InitDstTensor(paddle::lite_api::Tensor* dst,
const framework::LoDTensor& src) {
// Currently, Lite needs to explicitly specify the target type of // Currently, Lite needs to explicitly specify the target type of
// the input tensor. // the input tensor.
constexpr int empty_size = 0; constexpr int empty_size = 0;
dst->mutable_data(GetLiteTargetType(src.place()), empty_size); dst->Resize({empty_size});
dst->set_precision(GetLitePrecisionType(src.type())); GetLiteTensorDataPtr(dst, GetLitePrecisionType(src.type()),
SetLoD(dst->mutable_lod(), src.lod()); GetLiteTargetType(src.place()));
dst->SetPrecision(GetLitePrecisionType(src.type()));
paddle::lite::LoD lite_lod;
SetLoD(&lite_lod, src.lod());
dst->SetLoD(lite_lod);
} }
void InitDstTensor(framework::LoDTensor* dst, const paddle::lite::Tensor& src) { void InitDstTensor(framework::LoDTensor* dst,
const paddle::lite_api::Tensor& src) {
constexpr framework::proto::VarType::Type dtype = constexpr framework::proto::VarType::Type dtype =
framework::proto::VarType_Type_FP32; framework::proto::VarType_Type_FP32;
dst->mutable_data(inference::lite::utils::GetNativePlace(src.target()), dst->mutable_data(inference::lite::utils::GetNativePlace(src.target()),
...@@ -162,7 +202,8 @@ void InitDstTensor(framework::LoDTensor* dst, const paddle::lite::Tensor& src) { ...@@ -162,7 +202,8 @@ void InitDstTensor(framework::LoDTensor* dst, const paddle::lite::Tensor& src) {
} }
template <> template <>
void TensorCopyAsync(paddle::lite::Tensor* dst, const framework::LoDTensor& src, void TensorCopyAsync(paddle::lite_api::Tensor* dst,
const framework::LoDTensor& src,
const platform::DeviceContext& ctx) { const platform::DeviceContext& ctx) {
InitDstTensor(dst, src); InitDstTensor(dst, src);
const platform::Place& src_place = src.place(); const platform::Place& src_place = src.place();
...@@ -171,52 +212,56 @@ void TensorCopyAsync(paddle::lite::Tensor* dst, const framework::LoDTensor& src, ...@@ -171,52 +212,56 @@ void TensorCopyAsync(paddle::lite::Tensor* dst, const framework::LoDTensor& src,
static_cast<size_t>(src.numel()) * framework::SizeOfType(src.type()); static_cast<size_t>(src.numel()) * framework::SizeOfType(src.type());
dst->Resize(framework::vectorize(src.dims())); dst->Resize(framework::vectorize(src.dims()));
const void* src_data = src.data<void>(); const void* src_data = src.data<void>();
void* dst_data = dst->mutable_data(bytes); void* dst_data{nullptr};
dst_data = GetLiteTensorDataPtr(dst, GetLitePrecisionType(src.type()),
GetLiteTargetType(src.place()));
VLOG(3) << "[CopyAsync fluid -> lite] Bytes = " << bytes << ", src = " << &src VLOG(3) << "[CopyAsync fluid -> lite] Bytes = " << bytes << ", src = " << &src
<< ", dst = " << dst << ", src_type = " << src.type(); << ", dst = " << dst << ", src_type = " << src.type();
MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx); MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx);
VLOG(3) << "[Lite memory size] Bytes = " << dst->memory_size(); VLOG(3) << "[Lite memory size] Bytes = " << bytes;
} }
template <> template <>
void TensorCopyAsync(framework::LoDTensor* dst, const paddle::lite::Tensor& src, void TensorCopyAsync(framework::LoDTensor* dst,
const paddle::lite_api::Tensor& src,
const platform::DeviceContext& ctx) { const platform::DeviceContext& ctx) {
dst->Resize(paddle::framework::make_ddim(src.dims().Vectorize())); dst->Resize(paddle::framework::make_ddim(src.shape()));
InitDstTensor(dst, src); InitDstTensor(dst, src);
const platform::Place& src_place = GetNativePlace(src.target()); const platform::Place& src_place = GetNativePlace(src.target());
const platform::Place& dst_place = dst->place(); const platform::Place& dst_place = dst->place();
const size_t bytes = int64_t src_numel = GetLiteTensorNumel(src);
static_cast<size_t>(src.numel()) * framework::SizeOfType(dst->type()); const size_t bytes = src_numel * framework::SizeOfType(dst->type());
const void* src_data = src.raw_data(); const void* src_data = src.data<void>();
// When Lite is ready, the source type needs to be modified here. // When Lite is ready, the source type needs to be modified here.
void* dst_data = dst->mutable_data(dst_place, dst->type()); void* dst_data = dst->mutable_data(dst_place, dst->type());
VLOG(3) << "[CopyAsync lite -> fluid] Bytes = " << bytes << ", src = " << &src VLOG(3) << "[CopyAsync lite -> fluid] Bytes = " << bytes << ", src = " << &src
<< ", dst = " << dst << ", src_type = " << dst->type(); << ", dst = " << dst << ", src_type = " << dst->type();
MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx); MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx);
VLOG(3) << "[Lite memory size] Bytes = " << src.memory_size(); VLOG(3) << "[Lite memory size] Bytes = " << bytes;
} }
template <> template <>
void TensorDataShare(paddle::lite::Tensor* dst, framework::LoDTensor* src) { void TensorDataShare(paddle::lite_api::Tensor* dst, framework::LoDTensor* src) {
const size_t bytes =
static_cast<size_t>(src->numel()) * framework::SizeOfType(src->type());
auto buf = std::make_shared<paddle::lite::Buffer>(paddle::lite::Buffer(
src->data<void>(), GetLiteTargetType(src->place()), src->memory_size()));
dst->Resize(framework::vectorize(src->dims())); dst->Resize(framework::vectorize(src->dims()));
dst->set_precision(GetLitePrecisionType(src->type())); dst->ShareExternalMemory(src->data<void>(), src->memory_size(),
SetLoD(dst->mutable_lod(), src->lod()); GetLiteTargetType(src->place()));
dst->ResetBuffer(buf, bytes); dst->SetPrecision(GetLitePrecisionType(src->type()));
paddle::lite::LoD lite_lod;
SetLoD(&lite_lod, src->lod());
dst->SetLoD(lite_lod);
} }
template <> template <>
void TensorDataShare(framework::LoDTensor* dst, paddle::lite::Tensor* src) { void TensorDataShare(framework::LoDTensor* dst, paddle::lite_api::Tensor* src) {
constexpr framework::proto::VarType::Type dtype = constexpr framework::proto::VarType::Type dtype =
framework::proto::VarType_Type_FP32; framework::proto::VarType_Type_FP32;
void* src_raw_data = src->raw_data(); void* src_raw_data =
GetLiteTensorDataPtr(src, GetLitePrecisionType(dtype), src->target());
size_t memory_size = GetLiteTensorNumel(*src) * sizeof(float);
std::shared_ptr<memory::allocation::Allocation> holder( std::shared_ptr<memory::allocation::Allocation> holder(
new memory::allocation::Allocation(src_raw_data, src->memory_size(), new memory::allocation::Allocation(src_raw_data, memory_size,
GetNativePlace(src->target()))); GetNativePlace(src->target())));
dst->Resize(paddle::framework::make_ddim(src->dims().Vectorize())); dst->Resize(paddle::framework::make_ddim(src->shape()));
SetLoD(dst->mutable_lod(), src->lod()); SetLoD(dst->mutable_lod(), src->lod());
dst->ResetHolderWithType(holder, dtype); dst->ResetHolderWithType(holder, dtype);
} }
......
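Editor's note: the new copy paths above size their memcpy from the lite tensor's shape rather than from Tensor::memory_size(). A tiny standalone sketch of that shape -> element count -> byte count arithmetic (the same std::accumulate pattern as GetLiteTensorNumel), run on a made-up NCHW shape:

#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  std::vector<int64_t> shape = {2, 3, 224, 224};  // hypothetical NCHW shape
  int64_t numel = std::accumulate(shape.begin(), shape.end(),
                                  static_cast<int64_t>(1),
                                  std::multiplies<int64_t>());
  size_t bytes = static_cast<size_t>(numel) * sizeof(float);  // FP32 payload
  std::cout << "numel = " << numel << ", bytes = " << bytes << std::endl;
  // Prints: numel = 301056, bytes = 1204224
  return 0;
}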
...@@ -102,10 +102,10 @@ TEST(EngineManager, engine) { ...@@ -102,10 +102,10 @@ TEST(EngineManager, engine) {
config.model_from_memory = true; config.model_from_memory = true;
config.valid_places = { config.valid_places = {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
paddle::lite::Place({TARGET(kCUDA), PRECISION(kFloat)}), paddle::lite_api::Place({TARGET(kCUDA), PRECISION(kFloat)}),
#endif #endif
paddle::lite::Place({TARGET(kX86), PRECISION(kFloat)}), paddle::lite_api::Place({TARGET(kX86), PRECISION(kFloat)}),
paddle::lite::Place({TARGET(kHost), PRECISION(kAny)}), paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}),
}; };
LOG(INFO) << "Create EngineManager"; LOG(INFO) << "Create EngineManager";
...@@ -118,7 +118,7 @@ TEST(EngineManager, engine) { ...@@ -118,7 +118,7 @@ TEST(EngineManager, engine) {
ASSERT_EQ(inference::Singleton<inference::lite::EngineManager>::Global().Has( ASSERT_EQ(inference::Singleton<inference::lite::EngineManager>::Global().Has(
unique_key), unique_key),
true); true);
paddle::lite::Predictor* engine_0 = paddle::lite_api::PaddlePredictor* engine_0 =
inference::Singleton<inference::lite::EngineManager>::Global().Get( inference::Singleton<inference::lite::EngineManager>::Global().Get(
unique_key); unique_key);
CHECK_NOTNULL(engine_0); CHECK_NOTNULL(engine_0);
......
...@@ -73,6 +73,33 @@ TEST(LiteEngineOp, GetNativeLayoutType) { ...@@ -73,6 +73,33 @@ TEST(LiteEngineOp, GetNativeLayoutType) {
EXPECT_ANY_THROW(GetNativeLayoutType(DataLayoutType::kNHWC)); EXPECT_ANY_THROW(GetNativeLayoutType(DataLayoutType::kNHWC));
} }
template <typename T>
void test_lite_tensor_data_ptr(PrecisionType precision_type) {
void* GetLiteTensorDataPtr(paddle::lite_api::Tensor * src,
PrecisionType precision_type,
TargetType target_type);
const int count = 4;
paddle::lite::Tensor lite_tensor;
lite_tensor.Resize({count});
auto* lite_tensor_data = lite_tensor.mutable_data<T>();
for (size_t i = 0; i < count; ++i) {
lite_tensor_data[i] = i;
}
paddle::lite_api::Tensor lite_api_tensor(&lite_tensor);
T* data = static_cast<T*>(GetLiteTensorDataPtr(
&lite_api_tensor, precision_type, TargetType::kHost));
for (size_t i = 0; i < count; ++i) {
CHECK_EQ(data[i], static_cast<T>(i)) << "the i-th num is not correct.";
}
}
TEST(LiteEngineOp, GetLiteTensorDataPtr) {
test_lite_tensor_data_ptr<int64_t>(PrecisionType::kInt64);
test_lite_tensor_data_ptr<int32_t>(PrecisionType::kInt32);
test_lite_tensor_data_ptr<int8_t>(PrecisionType::kInt8);
EXPECT_ANY_THROW(test_lite_tensor_data_ptr<double>(PrecisionType::kUnk));
}
void test_tensor_copy(const platform::DeviceContext& ctx) { void test_tensor_copy(const platform::DeviceContext& ctx) {
// Create LoDTensor. // Create LoDTensor.
std::vector<float> vector({1, 2, 3, 4}); std::vector<float> vector({1, 2, 3, 4});
...@@ -83,10 +110,11 @@ void test_tensor_copy(const platform::DeviceContext& ctx) { ...@@ -83,10 +110,11 @@ void test_tensor_copy(const platform::DeviceContext& ctx) {
lod_tensor.set_lod(lod); lod_tensor.set_lod(lod);
// Create lite::Tensor and copy. // Create lite::Tensor and copy.
paddle::lite::Tensor lite_tensor; paddle::lite::Tensor lite_tensor;
TensorCopyAsync(&lite_tensor, lod_tensor, ctx); paddle::lite_api::Tensor lite_api_tensor(&lite_tensor);
TensorCopyAsync(&lite_api_tensor, lod_tensor, ctx);
// Copy to LoDTensor. // Copy to LoDTensor.
framework::LoDTensor lod_tensor_n; framework::LoDTensor lod_tensor_n;
TensorCopyAsync(&lod_tensor_n, lite_tensor, ctx); TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::is_gpu_place(ctx.GetPlace())) {
platform::GpuStreamSync( platform::GpuStreamSync(
...@@ -108,10 +136,11 @@ void test_tensor_share(const platform::DeviceContext& ctx) { ...@@ -108,10 +136,11 @@ void test_tensor_share(const platform::DeviceContext& ctx) {
lod_tensor.set_lod(lod); lod_tensor.set_lod(lod);
// Create lite::Tensor and share. // Create lite::Tensor and share.
paddle::lite::Tensor lite_tensor; paddle::lite::Tensor lite_tensor;
TensorDataShare(&lite_tensor, &lod_tensor); paddle::lite_api::Tensor lite_api_tensor(&lite_tensor);
TensorDataShare(&lite_api_tensor, &lod_tensor);
// Copy to LoDTensor. // Copy to LoDTensor.
framework::LoDTensor lod_tensor_n; framework::LoDTensor lod_tensor_n;
TensorCopyAsync(&lod_tensor_n, lite_tensor, ctx); TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx);
std::vector<float> result; std::vector<float> result;
TensorToVector(lod_tensor_n, ctx, &result); TensorToVector(lod_tensor_n, ctx, &result);
ASSERT_EQ(result, vector); ASSERT_EQ(result, vector);
......
...@@ -63,11 +63,13 @@ void TensorRTEngine::Execute(int batch_size, std::vector<void *> *buffers, ...@@ -63,11 +63,13 @@ void TensorRTEngine::Execute(int batch_size, std::vector<void *> *buffers,
void TensorRTEngine::FreezeNetwork() { void TensorRTEngine::FreezeNetwork() {
freshDeviceId(); freshDeviceId();
VLOG(3) << "TRT to freeze network"; VLOG(3) << "TRT to freeze network";
PADDLE_ENFORCE(infer_builder_ != nullptr, PADDLE_ENFORCE_NOT_NULL(infer_builder_,
"Call InitNetwork first to initialize network."); platform::errors::InvalidArgument(
PADDLE_ENFORCE_EQ(network() != nullptr, true, "Inference builder of TRT is null. Please make "
platform::errors::InvalidArgument( "sure you call InitNetwork first."));
"Call InitNetwork first to initialize network.")); PADDLE_ENFORCE_NOT_NULL(network(),
platform::errors::InvalidArgument(
"Call InitNetwork first to initialize network."));
// build engine. // build engine.
infer_builder_->setMaxBatchSize(max_batch_); infer_builder_->setMaxBatchSize(max_batch_);
infer_builder_->setMaxWorkspaceSize(max_workspace_); infer_builder_->setMaxWorkspaceSize(max_workspace_);
...@@ -210,7 +212,10 @@ void TensorRTEngine::FreezeNetwork() { ...@@ -210,7 +212,10 @@ void TensorRTEngine::FreezeNetwork() {
} else { } else {
infer_engine_.reset(infer_builder_->buildCudaEngine(*network())); infer_engine_.reset(infer_builder_->buildCudaEngine(*network()));
} }
PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!"); PADDLE_ENFORCE_NOT_NULL(
infer_engine_, platform::errors::Fatal(
"Build TensorRT cuda engine failed! Please recheck "
"you configurations related to paddle-TensorRT."));
} }
nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name, nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
...@@ -220,8 +225,16 @@ nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name, ...@@ -220,8 +225,16 @@ nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The TRT network should be initialized first.")); "The TRT network should be initialized first."));
auto *input = network()->addInput(name.c_str(), dtype, dims); auto *input = network()->addInput(name.c_str(), dtype, dims);
PADDLE_ENFORCE(input, "infer network add input %s failed", name); PADDLE_ENFORCE_NOT_NULL(
PADDLE_ENFORCE(input->isNetworkInput()); input, platform::errors::InvalidArgument("Adding input %s failed in "
"TensorRT inference network. "
"Please recheck your input.",
name));
PADDLE_ENFORCE_EQ(input->isNetworkInput(), true,
platform::errors::InvalidArgument(
"Input %s is not the input of TRT inference network. "
"Please recheck your input.",
name));
TensorRTEngine::SetITensor(name, input); TensorRTEngine::SetITensor(name, input);
return input; return input;
} }
...@@ -230,31 +243,53 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset, ...@@ -230,31 +243,53 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
const std::string &name) { const std::string &name) {
auto *output = layer->getOutput(offset); auto *output = layer->getOutput(offset);
SetITensor(name, output); SetITensor(name, output);
PADDLE_ENFORCE(output != nullptr); PADDLE_ENFORCE_NOT_NULL(
output, platform::errors::InvalidArgument(
"The output %s of TRT engine should not be null.", name));
output->setName(name.c_str()); output->setName(name.c_str());
PADDLE_ENFORCE(!output->isNetworkInput()); PADDLE_ENFORCE_EQ(output->isNetworkInput(), false,
platform::errors::InvalidArgument(
"The output %s of TRT engine should not be the input "
"of the network at the same time.",
name));
network()->markOutput(*output); network()->markOutput(*output);
PADDLE_ENFORCE(output->isNetworkOutput()); PADDLE_ENFORCE_EQ(
output->isNetworkOutput(), true,
platform::errors::InvalidArgument(
"The output %s of TRT engine should be the output of the network.",
name));
} }
void TensorRTEngine::DeclareOutput(const std::string &name) { void TensorRTEngine::DeclareOutput(const std::string &name) {
auto *output = TensorRTEngine::GetITensor(name); auto *output = TensorRTEngine::GetITensor(name);
PADDLE_ENFORCE(output != nullptr); PADDLE_ENFORCE_NOT_NULL(
output, platform::errors::InvalidArgument(
"The output %s of TRT engine should not be null.", name));
output->setName(name.c_str()); output->setName(name.c_str());
PADDLE_ENFORCE(!output->isNetworkInput()); PADDLE_ENFORCE_EQ(output->isNetworkInput(), false,
platform::errors::InvalidArgument(
"The output %s of TRT engine should not be the input "
"of the network at the same time.",
name));
network()->markOutput(*output); network()->markOutput(*output);
} }
void TensorRTEngine::SetITensor(const std::string &name, void TensorRTEngine::SetITensor(const std::string &name,
nvinfer1::ITensor *tensor) { nvinfer1::ITensor *tensor) {
PADDLE_ENFORCE(tensor != nullptr); PADDLE_ENFORCE_NOT_NULL(
PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s", tensor, platform::errors::InvalidArgument(
name); "Tensor named %s of TRT engine should not be null.", name));
PADDLE_ENFORCE_EQ(
0, itensor_map_.count(name),
platform::errors::InvalidArgument(
"Tensor named %s of TRT engine should not be duplicated", name));
itensor_map_[name] = tensor; itensor_map_[name] = tensor;
} }
nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) { nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) {
PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name); PADDLE_ENFORCE_EQ(itensor_map_.count(name), true,
platform::errors::NotFound(
"Tensor named %s is not found in TRT engine", name));
return itensor_map_[name]; return itensor_map_[name];
} }
...@@ -271,11 +306,11 @@ float *TensorRTEngine::GetWeightCPUData(const std::string &name, ...@@ -271,11 +306,11 @@ float *TensorRTEngine::GetWeightCPUData(const std::string &name,
std::string splitter = "__"; std::string splitter = "__";
std::string name_with_suffix = name + splitter + name_suffix; std::string name_with_suffix = name + splitter + name_suffix;
platform::CPUPlace cpu_place; platform::CPUPlace cpu_place;
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(weight_map.count(name_with_suffix), 0,
weight_map.count(name_with_suffix), 0, platform::errors::AlreadyExists(
"During TRT Op converter: We set weight %s with the same name " "The weight named %s is set into the weight map "
"twice into the weight_map", "twice in TRT OP converter.",
name_with_suffix); name_with_suffix));
weight_map[name_with_suffix].reset(new framework::Tensor()); weight_map[name_with_suffix].reset(new framework::Tensor());
weight_map[name_with_suffix]->Resize(weight_tensor->dims()); weight_map[name_with_suffix]->Resize(weight_tensor->dims());
TensorCopySync(*weight_tensor, cpu_place, weight_map[name_with_suffix].get()); TensorCopySync(*weight_tensor, cpu_place, weight_map[name_with_suffix].get());
...@@ -297,7 +332,10 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( ...@@ -297,7 +332,10 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
void TensorRTEngine::freshDeviceId() { void TensorRTEngine::freshDeviceId() {
int count; int count;
cudaGetDeviceCount(&count); cudaGetDeviceCount(&count);
PADDLE_ENFORCE_LT(device_id_, count); PADDLE_ENFORCE_LT(device_id_, count,
platform::errors::OutOfRange(
"Device id %d exceeds the current device count: %d.",
device_id_, count));
cudaSetDevice(device_id_); cudaSetDevice(device_id_);
} }
......
...@@ -196,8 +196,10 @@ class TensorRTEngine { ...@@ -196,8 +196,10 @@ class TensorRTEngine {
} }
nvinfer1::IHostMemory* Serialize() { nvinfer1::IHostMemory* Serialize() {
PADDLE_ENFORCE(infer_engine_ != nullptr, PADDLE_ENFORCE_NOT_NULL(
"You should build engine first and then serialize"); infer_engine_,
platform::errors::InvalidArgument(
"The TensorRT engine must be built first before serialization"));
ihost_memory_.reset(infer_engine_->serialize()); ihost_memory_.reset(infer_engine_->serialize());
return ihost_memory_.get(); return ihost_memory_.get();
} }
...@@ -222,8 +224,14 @@ class TensorRTEngine { ...@@ -222,8 +224,14 @@ class TensorRTEngine {
engine_serialized_data.c_str(), engine_serialized_data.size(), engine_serialized_data.c_str(), engine_serialized_data.size(),
&inference::Singleton<plugin::PluginFactoryTensorRT>::Global())); &inference::Singleton<plugin::PluginFactoryTensorRT>::Global()));
} }
PADDLE_ENFORCE(infer_engine_ != nullptr, PADDLE_ENFORCE_NOT_NULL(
"build cuda engine failed when deserialize engine info.!"); infer_engine_,
platform::errors::Fatal(
"Building TRT cuda engine failed when deserializing engine info. "
"Please check:\n1. Your TRT serialization is generated and loaded "
"on the same GPU architecture;\n2. The Paddle Inference version of "
"generating serialization file and doing inference are "
"consistent."));
} }
void SetRuntimeBatch(size_t batch_size); void SetRuntimeBatch(size_t batch_size);
......
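Editor's note: Serialize() above hands back an nvinfer1::IHostMemory*, and the deserialize path consumes the same bytes as a std::string. A minimal sketch of persisting and reloading that blob, using only IHostMemory::data()/size() and standard file I/O; the helper names are hypothetical and the TensorRT headers are assumed to be available.

#include <fstream>
#include <iterator>
#include <string>

#include "NvInfer.h"

// Write the blob returned by TensorRTEngine::Serialize() to disk.
void SaveSerializedEngine(nvinfer1::IHostMemory* blob,
                          const std::string& path) {
  std::ofstream out(path, std::ios::binary);
  out.write(static_cast<const char*>(blob->data()),
            static_cast<std::streamsize>(blob->size()));
}

// Read the bytes back; the resulting string is what the deserialize path
// above expects as engine_serialized_data.
std::string LoadSerializedEngine(const std::string& path) {
  std::ifstream in(path, std::ios::binary);
  return std::string((std::istreambuf_iterator<char>(in)),
                     std::istreambuf_iterator<char>());
}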
...@@ -56,14 +56,27 @@ __global__ void elementwise_kernel(const size_t total, const T *x_data, ...@@ -56,14 +56,27 @@ __global__ void elementwise_kernel(const size_t total, const T *x_data,
nvinfer1::Dims ElementWisePlugin::getOutputDimensions( nvinfer1::Dims ElementWisePlugin::getOutputDimensions(
int index, const nvinfer1::Dims *input_dims, int num_inputs) { int index, const nvinfer1::Dims *input_dims, int num_inputs) {
PADDLE_ENFORCE_EQ(index, 0); PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
PADDLE_ENFORCE_EQ(num_inputs, 2); "There is only one output in TRT elementwise "
PADDLE_ENFORCE_NOT_NULL(input_dims); "op plugin, but got output index: %d.",
index));
PADDLE_ENFORCE_EQ(num_inputs, 2, platform::errors::InvalidArgument(
"There are 2 inputs in TRT elementwise "
"op plugin, but got input number: %d.",
num_inputs));
PADDLE_ENFORCE_NOT_NULL(
input_dims,
platform::errors::InvalidArgument(
"The input dims of TRT elementwise op plugin should not be null."));
return input_dims[0]; return input_dims[0];
} }
int ElementWisePlugin::initialize() { int ElementWisePlugin::initialize() {
PADDLE_ENFORCE_GT(dims_y_.nbDims, 0); PADDLE_ENFORCE_GT(dims_y_.nbDims, 0,
platform::errors::InvalidArgument(
"The dimension of input Y of TRT elementwise op plugin "
"should be greater than 0, but got %d.",
dims_y_.nbDims));
axis_ = (axis_ == -1) ? dims_x_.nbDims - dims_y_.nbDims : axis_; axis_ = (axis_ == -1) ? dims_x_.nbDims - dims_y_.nbDims : axis_;
int trimed_nb_dims = dims_y_.nbDims; int trimed_nb_dims = dims_y_.nbDims;
...@@ -74,8 +87,18 @@ int ElementWisePlugin::initialize() { ...@@ -74,8 +87,18 @@ int ElementWisePlugin::initialize() {
} }
dims_y_.nbDims = trimed_nb_dims; dims_y_.nbDims = trimed_nb_dims;
PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_); PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_,
PADDLE_ENFORCE_LT(axis_, dims_x_.nbDims); platform::errors::InvalidArgument(
"We expect [number of x dims] >= [number of y dims + "
"axis] in TRT elementwise op plugin, but got [number "
"of x dims] = %d, [number of y dims + axis] = %d.",
dims_x_.nbDims, dims_y_.nbDims + axis_));
PADDLE_ENFORCE_LT(
axis_, dims_x_.nbDims,
platform::errors::InvalidArgument("We expect [axis] < [number of x dims] "
"in TRT elementwise op plugin, but got "
"[axis] = %d, [number of x dims] = %d.",
axis_, dims_x_.nbDims));
prev_size_ = 1; prev_size_ = 1;
midd_size_ = 1; midd_size_ = 1;
...@@ -86,7 +109,9 @@ int ElementWisePlugin::initialize() { ...@@ -86,7 +109,9 @@ int ElementWisePlugin::initialize() {
for (int i = 0; i < dims_y_.nbDims; ++i) { for (int i = 0; i < dims_y_.nbDims; ++i) {
PADDLE_ENFORCE_EQ(dims_x_.d[i + axis_], dims_y_.d[i], PADDLE_ENFORCE_EQ(dims_x_.d[i + axis_], dims_y_.d[i],
"Broadcast dimension mismatch."); platform::errors::InvalidArgument(
"Broadcast dimension mismatch. The dims of input Y "
"should be a subsequence of X."));
midd_size_ *= dims_y_.d[i]; midd_size_ *= dims_y_.d[i];
} }
...@@ -221,7 +246,10 @@ int ElementwisePluginDynamic::enqueue( ...@@ -221,7 +246,10 @@ int ElementwisePluginDynamic::enqueue(
elementwise_kernel<<<block, thread, 0, stream>>>( elementwise_kernel<<<block, thread, 0, stream>>>(
num, x, y, out, prev_size, midd_size, post_size, details::Mul<float>()); num, x, y, out, prev_size, midd_size, post_size, details::Mul<float>());
} else { } else {
PADDLE_THROW("Not implemented."); PADDLE_THROW(platform::errors::Unimplemented(
"Paddle-TRT only support elementwise operation: {add, mul} currently, "
"but got %s.",
type_));
} }
return cudaGetLastError() != cudaSuccess; return cudaGetLastError() != cudaSuccess;
......
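Editor's note: initialize() above flattens the broadcast into a [prev, midd, post] view of X, where Y covers only the midd block. A standalone worked sketch of that decomposition on concrete dims follows; the plugin's own loops are only partially visible in this diff, so this reproduces the usual decomposition rather than quoting the file.

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> dims_x = {2, 3, 4, 5};  // e.g. an NCHW input
  std::vector<int> dims_y = {3};           // broadcast along the channel dim
  int axis = 1;                            // Y aligns with X starting at dim 1

  int prev = 1, midd = 1, post = 1;
  for (int i = 0; i < axis; ++i) prev *= dims_x[i];
  for (size_t i = 0; i < dims_y.size(); ++i) midd *= dims_y[i];
  for (size_t i = axis + dims_y.size(); i < dims_x.size(); ++i)
    post *= dims_x[i];

  // X is viewed as [prev, midd, post] and Y as [midd]; here that is 2 x 3 x 20.
  std::printf("prev=%d midd=%d post=%d\n", prev, midd, post);
  return 0;
}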
...@@ -74,7 +74,9 @@ TEST_F(TensorRTEngineTest, add_layer) { ...@@ -74,7 +74,9 @@ TEST_F(TensorRTEngineTest, add_layer) {
nvinfer1::DimsCHW{1, 1, 1}); nvinfer1::DimsCHW{1, 1, 1});
auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size, auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size,
weight.get(), bias.get()); weight.get(), bias.get());
PADDLE_ENFORCE(fc_layer != nullptr); PADDLE_ENFORCE_NOT_NULL(fc_layer,
platform::errors::InvalidArgument(
"TRT fully connected layer building failed."));
engine_->DeclareOutput(fc_layer, 0, "y"); engine_->DeclareOutput(fc_layer, 0, "y");
LOG(INFO) << "freeze network"; LOG(INFO) << "freeze network";
...@@ -116,7 +118,9 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { ...@@ -116,7 +118,9 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
nvinfer1::DimsCHW{1, 2, 1}); nvinfer1::DimsCHW{1, 2, 1});
auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2, auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2,
weight.get(), bias.get()); weight.get(), bias.get());
PADDLE_ENFORCE(fc_layer != nullptr); PADDLE_ENFORCE_NOT_NULL(fc_layer,
platform::errors::InvalidArgument(
"TRT fully connected layer building failed."));
engine_->DeclareOutput(fc_layer, 0, "y"); engine_->DeclareOutput(fc_layer, 0, "y");
engine_->FreezeNetwork(); engine_->FreezeNetwork();
...@@ -160,7 +164,9 @@ TEST_F(TensorRTEngineTest, test_conv2d) { ...@@ -160,7 +164,9 @@ TEST_F(TensorRTEngineTest, test_conv2d) {
auto *conv_layer = auto *conv_layer =
TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3}, TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3},
weight.get(), bias.get()); weight.get(), bias.get());
PADDLE_ENFORCE(conv_layer != nullptr); PADDLE_ENFORCE_NOT_NULL(conv_layer,
platform::errors::InvalidArgument(
"TRT convolution layer building failed."));
conv_layer->setStride(nvinfer1::DimsHW{1, 1}); conv_layer->setStride(nvinfer1::DimsHW{1, 1});
conv_layer->setPadding(nvinfer1::DimsHW{1, 1}); conv_layer->setPadding(nvinfer1::DimsHW{1, 1});
...@@ -199,7 +205,9 @@ TEST_F(TensorRTEngineTest, test_pool2d) { ...@@ -199,7 +205,9 @@ TEST_F(TensorRTEngineTest, test_pool2d) {
auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *x, pool_t, auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *x, pool_t,
nvinfer1::DimsHW{2, 2}); nvinfer1::DimsHW{2, 2});
PADDLE_ENFORCE(pool_layer != nullptr); PADDLE_ENFORCE_NOT_NULL(
pool_layer,
platform::errors::InvalidArgument("TRT pooling layer building failed."));
pool_layer->setStride(nvinfer1::DimsHW{1, 1}); pool_layer->setStride(nvinfer1::DimsHW{1, 1});
pool_layer->setPadding(nvinfer1::DimsHW{0, 0}); pool_layer->setPadding(nvinfer1::DimsHW{0, 0});
......
...@@ -83,9 +83,8 @@ bool TRTInt8Calibrator::setBatch( ...@@ -83,9 +83,8 @@ bool TRTInt8Calibrator::setBatch(
engine_name_, it.first)); engine_name_, it.first));
} }
const auto& d = dataptr->second; const auto& d = dataptr->second;
PADDLE_ENFORCE( PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice), cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice));
"Fail to cudaMemcpy %s for %s", engine_name_, it.first);
} }
data_is_set_ = true; data_is_set_ = true;
......
...@@ -342,9 +342,9 @@ if(WITH_MKLDNN) ...@@ -342,9 +342,9 @@ if(WITH_MKLDNN)
### Lexical analysis GRU model ### Lexical analysis GRU model
set(GRU_PATH "${INFERENCE_DEMO_INSTALL_DIR}/gru") set(GRU_PATH "${INFERENCE_DEMO_INSTALL_DIR}/gru")
download_GRU_data("${GRU_PATH}" "GRU_eval_data.tar.gz") download_GRU_data("${GRU_PATH}" "GRU_eval_data.tar.gz")
download_GRU_data("${GRU_PATH}" "GRU_eval_model.tar.gz") download_GRU_data("${GRU_PATH}" "GRU_eval_model_v2.tar.gz")
set(GRU_DATA_PATH "${GRU_PATH}/GRU_eval_data.bin") set(GRU_DATA_PATH "${GRU_PATH}/GRU_eval_data.bin")
set(GRU_MODEL_PATH "${GRU_PATH}/GRU_eval_model") set(GRU_MODEL_PATH "${GRU_PATH}/GRU_eval_model_v2")
set(LEXICAL_TEST_APP "test_analyzer_lexical_analysis") set(LEXICAL_TEST_APP "test_analyzer_lexical_analysis")
set(LEXICAL_TEST_APP_SRC "analyzer_lexical_analysis_gru_tester.cc") set(LEXICAL_TEST_APP_SRC "analyzer_lexical_analysis_gru_tester.cc")
...@@ -363,9 +363,12 @@ if(WITH_MKLDNN) ...@@ -363,9 +363,12 @@ if(WITH_MKLDNN)
inference_analysis_api_test_build(${QUANT_IMG_CLASS_TEST_APP} ${QUANT_IMG_CLASS_TEST_APP_SRC}) inference_analysis_api_test_build(${QUANT_IMG_CLASS_TEST_APP} ${QUANT_IMG_CLASS_TEST_APP_SRC})
# MobileNetV1 FP32 vs. Quant INT8 # MobileNetV1 FP32 vs. Quant INT8
# The FP32 model should already be downloaded for slim Quant unit tests on Linux
set(QUANT2_MobileNetV1_MODEL_DIR "${QUANT_DATA_DIR}/MobileNetV1_quant2") set(QUANT2_MobileNetV1_MODEL_DIR "${QUANT_DATA_DIR}/MobileNetV1_quant2")
set(QUANT2_INT8_MobileNetV1_MODEL_DIR "${QUANT_DATA_DIR}/MobileNetV1_quant2_int8") set(QUANT2_INT8_MobileNetV1_MODEL_DIR "${QUANT_DATA_DIR}/MobileNetV1_quant2_int8")
download_quant_data(${QUANT2_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf.tar.gz") if(NOT LINUX)
download_quant_data(${QUANT2_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf.tar.gz")
endif(NOT LINUX)
download_quant_data(${QUANT2_INT8_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf_int8.tar.gz") download_quant_data(${QUANT2_INT8_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf_int8.tar.gz")
inference_analysis_api_quant_test_run(test_analyzer_quant_performance_benchmark ${QUANT_IMG_CLASS_TEST_APP} ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float ${QUANT2_INT8_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf_int8 ${IMAGENET_DATA_PATH}) inference_analysis_api_quant_test_run(test_analyzer_quant_performance_benchmark ${QUANT_IMG_CLASS_TEST_APP} ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float ${QUANT2_INT8_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf_int8 ${IMAGENET_DATA_PATH})
......
...@@ -27,7 +27,7 @@ TEST(AnalysisPredictor, use_gpu) { ...@@ -27,7 +27,7 @@ TEST(AnalysisPredictor, use_gpu) {
AnalysisConfig config; AnalysisConfig config;
config.EnableUseGpu(100, 0); config.EnableUseGpu(100, 0);
config.SetModel(model_dir + "/model", model_dir + "/params"); config.SetModel(model_dir + "/model", model_dir + "/params");
config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32); config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32, true);
std::vector<PaddleTensor> inputs; std::vector<PaddleTensor> inputs;
auto predictor = CreatePaddlePredictor(config); auto predictor = CreatePaddlePredictor(config);
......
...@@ -45,7 +45,9 @@ endif() ...@@ -45,7 +45,9 @@ endif()
SET(OP_HEADER_DEPS xxhash executor) SET(OP_HEADER_DEPS xxhash executor)
if (WITH_GPU) if (WITH_GPU)
SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub) if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub)
endif()
endif() endif()
SET(OP_PREFETCH_DEPS "") SET(OP_PREFETCH_DEPS "")
......
...@@ -12,32 +12,30 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,32 +12,30 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h" #include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h"
#include "paddle/fluid/framework/tensor_util.h"
#include <string>
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
class AmpCheckFiniteAndScaleOp : public framework::OperatorWithKernel { class CheckFiniteAndUnscaleOp : public framework::OperatorWithKernel {
public: public:
AmpCheckFiniteAndScaleOp(const std::string &type, CheckFiniteAndUnscaleOp(const std::string& type,
const framework::VariableNameMap &inputs, const framework::VariableNameMap& inputs,
const framework::VariableNameMap &outputs, const framework::VariableNameMap& outputs,
const framework::AttributeMap &attrs) const framework::AttributeMap& attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {} : OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X",
"amp_check_finite_and_unscale"); "check_finite_and_unscale");
OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out", OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out",
"amp_check_finite_and_unscale"); "check_finite_and_unscale");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
ctx->Inputs("X").size(), ctx->Outputs("Out").size(), ctx->Inputs("X").size(), ctx->Outputs("Out").size(),
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The input(X) and output(Out) should have same size in " "The input(X) and output(Out) should have same size in "
"Operator(amp_check_finite_and_unscale), size of input(X) is %d " "Operator(check_finite_and_unscale), size of input(X) is %d "
"and size of output(Out) is %d.", "and size of output(Out) is %d.",
ctx->Inputs("X").size(), ctx->Outputs("Out").size())); ctx->Inputs("X").size(), ctx->Outputs("Out").size()));
auto x_dims = ctx->GetInputsDim("X"); auto x_dims = ctx->GetInputsDim("X");
...@@ -47,34 +45,34 @@ class AmpCheckFiniteAndScaleOp : public framework::OperatorWithKernel { ...@@ -47,34 +45,34 @@ class AmpCheckFiniteAndScaleOp : public framework::OperatorWithKernel {
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
} }
}; };
class AmpCheckFiniteAndScaleOpMaker : public framework::OpProtoAndCheckerMaker { class CheckFiniteAndUnscaleOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddInput( AddInput(
"X", "X",
"(Tensors) The input tensors of amp_check_finite_and_scale operator.") "(Tensors) The input tensors of check_finite_and_unscale operator.")
.AsDuplicable(); .AsDuplicable();
AddInput("Scale", AddInput("Scale",
"(Tensor) 1-dim tensor, the scale of amp_check_finite_and_scale " "(Tensor) 1-dim tensor, the scale of check_finite_and_unscale "
"operator."); "operator.");
AddOutput("Out", AddOutput("Out",
"(Tensors) The scaled output tensor of " "(Tensors) The scaled output tensor of "
"amp_check_finite_and_unscale operator.") "check_finite_and_unscale operator.")
.AsDuplicable(); .AsDuplicable();
AddOutput("FoundInfinite", AddOutput("FoundInfinite",
"(Tensor) 1-dim tensor, contains a bool scalar, which indicates " "(Tensor) 1-dim tensor, contains a bool scalar, which indicates "
"if there there is infinite or nan item in input X."); "if there there is infinite or nan item in input X.");
AddComment(R"DOC( AddComment(R"DOC(
amp_check_finite_and_scale operator. check_finite_and_unscale operator.
Check if input X contains all finite data, if yes, scale it by input Scale. Check if input X contains all finite data, if yes, scale it by input Scale.
$$Out = X * scale$$ $$Out = X / scale$$
If any tensor in X contains Inf or NaN, the Out will generate an indicator. If any tensor in X contains Inf or NaN, the Out will generate an indicator.
FoundInfinite will be 1 (True), and Out will not be scaled. In this case, the data of FoundInfinite will be 1 (True), and Out will not be scaled. In this case, the data of
...@@ -85,20 +83,59 @@ Otherwise, FoundInfinite will be 0 (False). ...@@ -85,20 +83,59 @@ Otherwise, FoundInfinite will be 0 (False).
} }
}; };
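To make the semantics above concrete, a minimal standalone C++ sketch follows (the function name and the std::vector-based interface are illustrative, not Paddle's API). It mirrors the CPU kernel added below: each element is multiplied by 1 / Scale unless a non-finite value has been found, after which the remaining outputs are zeroed and the found-infinite flag is set.

#include <cmath>
#include <cstddef>
#include <vector>

// Sketch of check_finite_and_unscale on flat float buffers.
void CheckFiniteAndUnscaleSketch(const std::vector<std::vector<float>>& xs,
                                 float scale,
                                 std::vector<std::vector<float>>* outs,
                                 bool* found_infinite) {
  *found_infinite = false;
  // Compute the reciprocal once so the inner loop multiplies instead of divides.
  const float inverse_scale = 1.0f / scale;
  outs->assign(xs.size(), std::vector<float>());
  for (std::size_t i = 0; i < xs.size(); ++i) {
    // Stop scanning for Inf/NaN once one has been found.
    if (!*found_infinite) {
      for (float v : xs[i]) {
        if (!std::isfinite(v)) {
          *found_infinite = true;
          break;
        }
      }
    }
    (*outs)[i].resize(xs[i].size());
    for (std::size_t j = 0; j < xs[i].size(); ++j) {
      (*outs)[i][j] = *found_infinite ? 0.0f : xs[i][j] * inverse_scale;
    }
  }
}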
template <typename T>
class CheckFiniteAndUnscaleCpuKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
const auto xs = ctx.MultiInput<framework::Tensor>("X");
const auto* scale = ctx.Input<framework::Tensor>("Scale");
auto outs = ctx.MultiOutput<framework::Tensor>("Out");
auto* found_inf = ctx.Output<framework::Tensor>("FoundInfinite");
const T* scale_data = scale->data<T>();
bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace());
*found_inf_data = false;
framework::Tensor is_finite =
ctx.AllocateTmpTensor<bool, platform::CPUDeviceContext>({1}, dev_ctx);
bool* is_finite_data = is_finite.template data<bool>();
auto& dev = *ctx.template device_context<platform::CPUDeviceContext>()
.eigen_device();
T inverse_scale = Inverse<T>(*scale_data);
for (size_t i = 0; i < xs.size(); ++i) {
const auto* x = xs[i];
auto* out = outs[i];
out->mutable_data<T>(dev_ctx.GetPlace());
if (!(*found_inf_data)) {
framework::TensorIsfinite(*x, &is_finite);
*found_inf_data = !(*is_finite_data);
}
auto eigen_out = framework::EigenVector<T>::Flatten(*out);
auto eigen_in = framework::EigenVector<T>::Flatten(*x);
if (!(*found_inf_data)) {
eigen_out.device(dev) = eigen_in * inverse_scale;
} else {
eigen_out.device(dev) = eigen_in * static_cast<T>(0);
}
}
return;
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR( REGISTER_OPERATOR(
amp_check_finite_and_scale, ops::AmpCheckFiniteAndScaleOp, check_finite_and_unscale, ops::CheckFiniteAndUnscaleOp,
ops::AmpCheckFiniteAndScaleOpMaker, ops::CheckFiniteAndUnscaleOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>, paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>); paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(check_finite_and_unscale,
amp_check_finite_and_scale, ops::CheckFiniteAndUnscaleCpuKernel<float>,
ops::AmpCheckFiniteAndScaleKernel<paddle::platform::CPUDeviceContext, ops::CheckFiniteAndUnscaleCpuKernel<double>);
float>,
ops::AmpCheckFiniteAndScaleKernel<paddle::platform::CPUDeviceContext,
double>);
...@@ -14,28 +14,31 @@ limitations under the License. */ ...@@ -14,28 +14,31 @@ limitations under the License. */
#include <cuda.h> #include <cuda.h>
#include "paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h" #include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename T> template <typename T>
__global__ void AmpCheckFiniteAndScale(const T* in, const T* scale, int num, __global__ void GpuInverse(const T* s, T* o) {
bool* found_inf, T* out) { *o = Inverse<T>(*s);
}
template <typename T>
__global__ void CheckFiniteAndUnscale(const T* in, const T* scale, int num,
bool* found_inf, T* out) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x; const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < num) { if (idx < num) {
if (!isfinite(in[idx])) { if (!isfinite(in[idx])) {
*found_inf = 1; *found_inf = true;
} }
out[idx] = *found_inf ? in[idx] : in[idx] * scale[0]; out[idx] = *found_inf ? in[idx] : in[idx] * (*scale);
} }
} }
template <typename T> template <typename T>
class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T> class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel<T> {
: public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const { void Compute(const framework::ExecutionContext& ctx) const {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
...@@ -48,6 +51,12 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T> ...@@ -48,6 +51,12 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T>
bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace()); bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace());
cudaMemset(found_inf_data, false, found_inf->numel() * sizeof(bool)); cudaMemset(found_inf_data, false, found_inf->numel() * sizeof(bool));
framework::Tensor inverse_scale =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({1}, dev_ctx);
T* inverse_scale_v = inverse_scale.template data<T>();
GpuInverse<T><<<1, 1, 0, dev_ctx.stream()>>>(scale_data, inverse_scale_v);
for (size_t i = 0; i < xs.size(); ++i) { for (size_t i = 0; i < xs.size(); ++i) {
const auto* x = xs[i]; const auto* x = xs[i];
auto* out = outs[i]; auto* out = outs[i];
...@@ -55,11 +64,11 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T> ...@@ -55,11 +64,11 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T>
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace()); T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = x->numel(); int num = x->numel();
int block = 512; int block = 1024;
int grid = (num + block - 1) / block; int grid = (num + block - 1) / block;
VLOG(3) << "launch kernel"; VLOG(3) << "launch kernel";
AmpCheckFiniteAndScale<T><<<grid, block, 0, dev_ctx.stream()>>>( CheckFiniteAndUnscale<T><<<grid, block, 0, dev_ctx.stream()>>>(
x_data, scale_data, num, found_inf_data, out_data); x_data, inverse_scale_v, num, found_inf_data, out_data);
VLOG(3) << "finish kernel"; VLOG(3) << "finish kernel";
} }
} }
...@@ -68,9 +77,6 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T> ...@@ -68,9 +77,6 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T>
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(check_finite_and_unscale,
amp_check_finite_and_scale, ops::CheckFiniteAndUnscaleGpuKernel<float>,
ops::AmpCheckFiniteAndScaleKernel<paddle::platform::CUDADeviceContext, ops::CheckFiniteAndUnscaleGpuKernel<double>);
float>,
ops::AmpCheckFiniteAndScaleKernel<paddle::platform::CUDADeviceContext,
double>);
...@@ -16,51 +16,16 @@ limitations under the License. */ ...@@ -16,51 +16,16 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/isfinite_op.h" #include "paddle/fluid/operators/isfinite_op.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename DeviceContext, typename T> template <typename T>
class AmpCheckFiniteAndScaleKernel : public framework::OpKernel<T> { inline HOSTDEVICE T Inverse(T s) {
public: return 1.0 / s;
void Compute(const framework::ExecutionContext& ctx) const { }
auto& dev_ctx = ctx.template device_context<DeviceContext>();
const auto xs = ctx.MultiInput<framework::Tensor>("X");
const auto* scale = ctx.Input<framework::Tensor>("Scale");
auto outs = ctx.MultiOutput<framework::Tensor>("Out");
auto* found_inf = ctx.Output<framework::Tensor>("FoundInfinite");
const T* scale_data = scale->data<T>();
bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace());
*found_inf_data = false;
framework::Tensor is_finite =
ctx.AllocateTmpTensor<bool, DeviceContext>({1}, dev_ctx);
bool* is_finite_data = is_finite.template data<bool>();
auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
for (size_t i = 0; i < xs.size(); ++i) {
const auto* x = xs[i];
auto* out = outs[i];
out->mutable_data<T>(dev_ctx.GetPlace());
if (!(*found_inf_data)) {
framework::TensorIsfinite(*x, &is_finite);
if (*is_finite_data) {
auto eigen_out = framework::EigenVector<T>::Flatten(*out);
auto eigen_in = framework::EigenVector<T>::Flatten(*x);
eigen_out.device(dev) = (*scale_data) * eigen_in;
} else {
*found_inf_data = true;
break;
}
}
}
return;
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/amp/update_loss_scaling_op.h"
#include <cstring>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
class UpdateLossScalingOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "update_loss_scaling");
OP_INOUT_CHECK(ctx->HasInput("FoundInfinite"), "Input", "FoundInfinite",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasInput("PrevLossScaling"), "Input", "PrevLossScaling",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasInput("InGoodSteps"), "Input", "InGoodSteps",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasInput("InBadSteps"), "Input", "InBadSteps",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasOutput("LossScaling"), "Output", "LossScaling",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasOutput("OutGoodSteps"), "Output", "OutGoodSteps",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasOutput("OutBadSteps"), "Output", "OutBadSteps",
"update_loss_scaling");
auto x_dims = ctx->GetInputsDim("X");
ctx->SetOutputsDim("Out", x_dims);
ctx->SetOutputDim("LossScaling", {1});
ctx->SetOutputDim("OutGoodSteps", {1});
ctx->SetOutputDim("OutBadSteps", {1});
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "PrevLossScaling"),
ctx.device_context());
}
};
class UpdateLossScalingOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensors) The input tensors of update_loss_scaling operator.")
.AsDuplicable();
AddInput("FoundInfinite",
"(Tensor) 1-dim tensor, contains a bool scalar, which indicates "
"whether there is any infinite gradient.");
AddInput("PrevLossScaling",
"(Tensor) 1-dim tensor, previous loss scaling.");
AddInput("InGoodSteps",
"(Tensor) 1-dim tensor, accumulates good steps in which all "
"gradients are finite.");
AddInput("InBadSteps",
"(Tensor) 1-dim tensor, accumulates bad steps in which some "
"gradients are infinite.");
AddOutput("Out",
"(Tensors) The output tensor of update_loss_scaling operator.")
.AsDuplicable();
AddOutput("LossScaling", "(Tensor) 1-dim tensor, updated loss scaling.");
AddOutput("OutGoodSteps", "(Tensor) 1-dim tensor, pdated good steps.");
AddOutput("OutBadSteps", "(Tensor) 1-dim tensor, updated bad steps.");
AddAttr<int>("incr_every_n_steps",
"A value represents increasing loss scaling every n "
"consecutive steps with finite gradients.");
AddAttr<int>("decr_every_n_nan_or_inf",
"A value represents decreasing loss scaling every n "
"accumulated steps with nan or inf gradients.");
AddAttr<float>("incr_ratio",
"The multiplier to use when increasing the loss scaling.")
.AddCustomChecker([](float incr_ratio) {
PADDLE_ENFORCE_EQ(incr_ratio > 1.0f, true,
platform::errors::InvalidArgument(
"'incr_ratio' should be greater than 1, but "
"the received is %f",
incr_ratio));
});
AddAttr<float>(
"decr_ratio",
"The less-than-one-multiplier to use when decreasing loss scaling.")
.AddCustomChecker([](float decr_ratio) {
PADDLE_ENFORCE_EQ(decr_ratio > 0.0f && decr_ratio < 1.0f, true,
platform::errors::InvalidArgument(
"'incr_ratio' should be between 0 and 1, but "
"the received is %f",
decr_ratio));
});
AddComment(R"DOC(
Update the loss scaling according to the overall gradients. If all gradients are
finite for incr_every_n_steps consecutive steps, the loss scaling increases by incr_ratio.
Otherwise, the loss scaling decreases by decr_ratio after
decr_every_n_nan_or_inf accumulated steps in which some gradients are infinite.
)DOC");
}
};
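As a worked example of this policy (attribute values chosen only for illustration): with incr_every_n_steps = 1000, decr_every_n_nan_or_inf = 2, incr_ratio = 2.0 and decr_ratio = 0.5, a loss scaling of 32768 grows to 65536 after 1000 consecutive all-finite steps and the good-step counter resets to 0; any step with non-finite gradients resets the good-step counter, and once 2 such steps have accumulated the loss scaling is halved to 16384 and the bad-step counter resets. The exact counter arithmetic is given by the Update function in update_loss_scaling_op.h further below.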
template <typename T>
class UpdateLossScalingFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& ctx,
const bool* found_inf_data, const T* pre_loss_scaling_data,
const int* good_in_data, const int* bad_in_data,
const int incr_every_n_steps,
const int decr_every_n_nan_or_inf, const float incr_ratio,
const float decr_ratio, T* updated_loss_scaling_data,
int* good_out_data, int* bad_out_data) const {
Update<T>(found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data,
incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio,
decr_ratio, updated_loss_scaling_data, good_out_data,
bad_out_data);
}
};
template <typename T>
class LazyZeroInputs<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& dev_ctx,
const bool* found_inf_data,
const std::vector<const framework::Tensor*>& xs,
const std::vector<framework::Tensor*>& outs) const {
if (*found_inf_data) {
VLOG(1) << "-- UpdateLossScaling: Infinite values are found in grads. --";
for (size_t i = 0; i < xs.size(); ++i) {
auto* out = outs[i];
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = out->numel();
std::memset(out_data, 0, num * sizeof(T));
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(
update_loss_scaling, ops::UpdateLossScalingOp,
ops::UpdateLossScalingOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(update_loss_scaling,
ops::UpdateLossScalingKernel<CPU, float>,
ops::UpdateLossScalingKernel<CPU, double>);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/amp/update_loss_scaling_op.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void GpuUpdateLossScaling(
const bool* found_inf_data, const T* pre_loss_scaling_data,
const int* good_in_data, const int* bad_in_data,
const int incr_every_n_steps, const int decr_every_n_nan_or_inf,
const float incr_ratio, const float decr_ratio,
T* updated_loss_scaling_data, int* good_out_data, int* bad_out_data) {
Update<T>(found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data,
incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio,
updated_loss_scaling_data, good_out_data, bad_out_data);
}
template <typename T>
class UpdateLossScalingFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& dev_ctx,
const bool* found_inf_data, const T* pre_loss_scaling_data,
const int* good_in_data, const int* bad_in_data,
const int incr_every_n_steps,
const int decr_every_n_nan_or_inf, const float incr_ratio,
const float decr_ratio, T* updated_loss_scaling_data,
int* good_out_data, int* bad_out_data) const {
GpuUpdateLossScaling<T><<<1, 1, 0, dev_ctx.stream()>>>(
found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data,
incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio,
updated_loss_scaling_data, good_out_data, bad_out_data);
}
};
template <typename T>
class LazyZeroInputs<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& dev_ctx,
const bool* found_inf_data,
const std::vector<const framework::Tensor*>& xs,
const std::vector<framework::Tensor*>& outs) const {
const auto gpu_place =
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace());
bool has_inf{false};
memory::Copy(platform::CPUPlace(), &has_inf, gpu_place, found_inf_data,
sizeof(bool), dev_ctx.stream());
if (has_inf) {
VLOG(1) << "-- UpdateLossScaling: Infinite values are found in grads. --";
for (size_t i = 0; i < xs.size(); ++i) {
auto* out = outs[i];
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = out->numel();
cudaMemset(out_data, 0, num * sizeof(T));
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using GPU = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(update_loss_scaling,
ops::UpdateLossScalingKernel<GPU, float>,
ops::UpdateLossScalingKernel<GPU, double>);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cmath>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/errors.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
HOSTDEVICE void Update(const bool* found_inf_data,
const T* pre_loss_scaling_data, const int* good_in_data,
const int* bad_in_data, const int incr_every_n_steps,
const int decr_every_n_nan_or_inf,
const float incr_ratio, const float decr_ratio,
T* updated_loss_scaling_data, int* good_out_data,
int* bad_out_data) {
if (*found_inf_data) {
*good_out_data = 0;
*bad_out_data = *bad_in_data + 1;
if (*bad_out_data == decr_every_n_nan_or_inf) {
T new_loss_scaling = *pre_loss_scaling_data * decr_ratio;
*updated_loss_scaling_data = new_loss_scaling < static_cast<T>(1)
? static_cast<T>(1)
: new_loss_scaling;
*bad_out_data = 0;
}
} else {
*bad_out_data = 0;
*good_out_data = *good_in_data + 1;
if (*good_out_data == incr_every_n_steps) {
T new_loss_scaling = *pre_loss_scaling_data * incr_ratio;
*updated_loss_scaling_data = std::isfinite(new_loss_scaling)
? new_loss_scaling
: *pre_loss_scaling_data;
*good_out_data = 0;
}
}
}
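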
template <typename DeviceContext, typename T>
class UpdateLossScalingFunctor {
public:
void operator()(const DeviceContext& dev_ctx, const bool* found_inf_data,
const T* pre_loss_scaling_data, const int* good_in_data,
const int* bad_in_data, const int incr_every_n_steps,
const int decr_every_n_nan_or_inf, const float incr_ratio,
const float decr_ratio, T* updated_loss_scaling_data,
int* good_out_data, int* bad_out_data) const;
};
template <typename DeviceContext, typename T>
class LazyZeroInputs {
public:
void operator()(const DeviceContext& dev_ctx, const bool* found_inf_data,
const std::vector<const framework::Tensor*>& xs,
const std::vector<framework::Tensor*>& outs) const;
};
template <typename DeviceContext, typename T>
class UpdateLossScalingKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto xs = ctx.MultiInput<framework::Tensor>("X");
const auto* found_inf = ctx.Input<Tensor>("FoundInfinite");
const auto* pre_loss_scaling = ctx.Input<Tensor>("PrevLossScaling");
const auto* good_in = ctx.Input<Tensor>("InGoodSteps");
const auto* bad_in = ctx.Input<Tensor>("InBadSteps");
auto outs = ctx.MultiOutput<framework::Tensor>("Out");
auto* updated_loss_scaling = ctx.Output<Tensor>("LossScaling");
auto* good_out = ctx.Output<Tensor>("OutGoodSteps");
auto* bad_out = ctx.Output<Tensor>("OutBadSteps");
PADDLE_ENFORCE_EQ(found_inf->numel(), 1,
platform::errors::InvalidArgument(
"FoundInfinite must has only one element."));
const bool* found_inf_data = found_inf->data<bool>();
const T* pre_loss_scaling_data = pre_loss_scaling->data<T>();
const int* good_in_data = good_in->data<int>();
const int* bad_in_data = bad_in->data<int>();
auto& dev_ctx = ctx.template device_context<DeviceContext>();
T* updated_loss_scaling_data =
updated_loss_scaling->mutable_data<T>(dev_ctx.GetPlace());
int* good_out_data = good_out->mutable_data<int>(dev_ctx.GetPlace());
int* bad_out_data = bad_out->mutable_data<int>(dev_ctx.GetPlace());
const int incr_every_n_steps = ctx.Attr<int>("incr_every_n_steps");
const int decr_every_n_nan_or_inf =
ctx.Attr<int>("decr_every_n_nan_or_inf");
const float incr_ratio = ctx.Attr<float>("incr_ratio");
const float decr_ratio = ctx.Attr<float>("decr_ratio");
UpdateLossScalingFunctor<DeviceContext, T>{}(
dev_ctx, found_inf_data, pre_loss_scaling_data, good_in_data,
bad_in_data, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio,
decr_ratio, updated_loss_scaling_data, good_out_data, bad_out_data);
LazyZeroInputs<DeviceContext, T>{}(dev_ctx, found_inf_data, xs, outs);
}
};
} // namespace operators
} // namespace paddle
...@@ -111,8 +111,16 @@ class CompareOp : public framework::OperatorWithKernel { ...@@ -111,8 +111,16 @@ class CompareOp : public framework::OperatorWithKernel {
framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
// CompareOp kernel's device type is decided by input tensor place // CompareOp kernel's device type is decided by input tensor place
bool force_cpu = ctx.Attr<bool>("force_cpu"); bool force_cpu = ctx.Attr<bool>("force_cpu");
kt.place_ = force_cpu ? platform::CPUPlace() if (force_cpu) {
: ctx.Input<framework::LoDTensor>("X")->place(); kt.place_ = platform::CPUPlace();
} else {
if (ctx.Input<framework::LoDTensor>("X")->place().type() !=
typeid(platform::CUDAPinnedPlace)) {
kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
} else {
kt.place_ = ctx.GetPlace();
}
}
return kt; return kt;
} }
}; };
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
inline std::vector<int64_t> CorrelationOutputSize(int batch, int input_height,
int input_width, int stride1,
int stride2, int kernel_size,
int pad_size,
int max_displacement) {
std::vector<int64_t> output_shape({batch});
int kernel_radius = (kernel_size - 1) / 2;
int border_radius = kernel_radius + max_displacement;
int padded_input_height = input_height + 2 * pad_size;
int padded_input_width = input_width + 2 * pad_size;
int output_channel = ((max_displacement / stride2) * 2 + 1) *
((max_displacement / stride2) * 2 + 1);
output_shape.push_back(output_channel);
int output_height =
std::ceil(static_cast<float>(padded_input_height - 2 * border_radius) /
static_cast<float>(stride1));
int output_width =
std::ceil(static_cast<float>(padded_input_width - 2 * border_radius) /
static_cast<float>(stride1));
output_shape.push_back(output_height);
output_shape.push_back(output_width);
return output_shape;
}
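As a worked instance of this computation (parameter values chosen only for illustration): for batch = 2, input_height = input_width = 64, pad_size = 4, kernel_size = 1, max_displacement = 4 and stride1 = stride2 = 1, it gives kernel_radius = 0, border_radius = 4, padded sizes 72 x 72, output_channel = (4 * 2 + 1)^2 = 81 and output_height = output_width = ceil((72 - 8) / 1) = 64, so the Output shape is [2, 81, 64, 64].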
class CorrelationOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Input1", "Input is a 4-D Tensor with shape [N, C, H, W]");
AddInput("Input2", "Input is a 4-D Tensor with shape [N, C, H, W]");
AddOutput("Output",
"(Tensor) The output tensor of correlation operator. "
"It has same data fromat and data type as the Input.");
AddAttr<int>("pad_size", "pad size for input1 and input2");
AddAttr<int>("kernel_size", "kernel size of input1 and input2");
AddAttr<int>("max_displacement", "max displacement of input1 and input2");
AddAttr<int>("stride1", "Input1 stride");
AddAttr<int>("stride2", "Input2 stride");
AddAttr<int>("corr_type_multiply", "correlation coefficient").SetDefault(1);
AddComment(
R"DOC(Correlation of two feature map. Only support NCHW data format.)DOC");
}
};
class CorrelationOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Input1"), "Input", "X", "CorrelationOp");
OP_INOUT_CHECK(ctx->HasInput("Input2"), "Input", "Y", "CorrelationOp");
int stride1 = ctx->Attrs().Get<int>("stride1");
int stride2 = ctx->Attrs().Get<int>("stride2");
int max_displacement = ctx->Attrs().Get<int>("max_displacement");
int pad_size = ctx->Attrs().Get<int>("pad_size");
int kernel_size = ctx->Attrs().Get<int>("kernel_size");
auto in_dims = ctx->GetInputDim("Input1");
auto in2_dims = ctx->GetInputDim("Input2");
PADDLE_ENFORCE_EQ(in_dims.size() == 4, true,
platform::errors::InvalidArgument(
"Input(X) of CorrelationOp must be 4 dims."
"But received dims is %d.",
in_dims.size()));
PADDLE_ENFORCE_EQ(in2_dims.size() == 4, true,
platform::errors::InvalidArgument(
"Input(Y) of CorrelationOp must be 4 dims."
"But received dims is %d.",
in2_dims.size()));
std::vector<int64_t> output_shape =
CorrelationOutputSize(in_dims[0], in_dims[2], in_dims[3], stride1,
stride2, kernel_size, pad_size, max_displacement);
ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto input_data_type =
OperatorWithKernel::IndicateVarDataType(ctx, "Input1");
PADDLE_ENFORCE_EQ(input_data_type, ctx.Input<Tensor>("Input2")->type(),
platform::errors::InvalidArgument(
"X and Y shoule have the same datatype"));
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
framework::OpKernelType GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const override {
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(), tensor.layout());
}
};
template <typename T>
class CorrelationOpGradMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("correlation_grad");
op->SetInput("Input1", this->Input("Input1"));
op->SetInput("Input2", this->Input("Input2"));
op->SetInput(framework::GradVarName("Output"), this->OutputGrad("Output"));
op->SetOutput(framework::GradVarName("Input1"), this->InputGrad("Input1"));
op->SetOutput(framework::GradVarName("Input2"), this->InputGrad("Input2"));
op->SetAttrMap(this->Attrs());
}
};
class CorrelationOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Input1"), "Input", "X", "CorrelationOp");
OP_INOUT_CHECK(ctx->HasInput("Input2"), "Input", "Y", "CorrelationOp");
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Output")), "Input",
"Output@GRAD", "CorrelationGradOp");
auto in1_dims = ctx->GetInputDim("Input1");
auto in2_dims = ctx->GetInputDim("Input2");
ctx->SetOutputDim(framework::GradVarName("Input1"), in1_dims);
ctx->SetOutputDim(framework::GradVarName("Input2"), in2_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "Input1"), ctx.GetPlace());
}
};
template <typename T>
class CorrelationKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::Unimplemented("Correlation only supports GPU now."));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(correlation, ops::CorrelationOp, ops::CorrelationOpMaker,
ops::CorrelationOpGradMaker<paddle::framework::OpDesc>,
ops::CorrelationOpGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(correlation_grad, ops::CorrelationOpGrad);
REGISTER_OP_CPU_KERNEL(correlation, ops::CorrelationKernel<float>,
ops::CorrelationKernel<double>);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
#define THREADS_PER_BLOCK 32
#define FULL_MASK 0xffffffff
using framework::Tensor;
using DataLayout = framework::DataLayout;
template <typename T>
__forceinline__ __device__ T warpReduceSum(T val) {
for (int offset = 16; offset > 0; offset /= 2) {
val += __shfl_down_sync(FULL_MASK, val, offset);
}
return val;
}
template <typename T>
__forceinline__ __device__ T blockReduceSum(T val) {
static __shared__ T shared[32];
int lane = threadIdx.x % warpSize;
int wid = threadIdx.x / warpSize;
val = warpReduceSum(val);
if (lane == 0) shared[wid] = val;
__syncthreads();
val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
if (wid == 0) val = warpReduceSum(val);
return val;
}
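For example, with blockDim.x = 1024 and warpSize = 32, warpReduceSum folds the 32 lane values of each warp into its lane 0 through five __shfl_down_sync steps (offsets 16, 8, 4, 2, 1); lane 0 of each warp writes its partial sum to shared[0..31], and the first warp reduces those 32 partial sums again, so thread 0 ends up returning the sum over the whole block.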
template <typename T>
__global__ void set_zero(T *x, int num) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
i += blockDim.x * gridDim.x)
x[i] = static_cast<T>(0);
}
template <typename T>
__global__ void channel_first(const T *input, T *rinput, const int channel,
const int height, const int width,
const int pad_size) {
int n = blockIdx.x;
int h = blockIdx.y;
int w = blockIdx.z;
int ch_off = threadIdx.x;
T value;
int dimchw = channel * height * width;
int dimhw = height * width;
int p_dimw = (width + 2 * pad_size);
int p_dimh = (height + 2 * pad_size);
int p_dimchw = channel * p_dimw * p_dimh;
int p_dimcw = channel * p_dimw;
for (int c = ch_off; c < channel; c += THREADS_PER_BLOCK) {
value = input[n * dimchw + c * dimhw + h * width + w];
rinput[n * p_dimchw + (h + pad_size) * p_dimcw + (w + pad_size) * channel +
c] = value;
}
}
template <typename T>
__global__ void correlation_forward(
T *output, const int output_channel, const int output_height,
const int output_width, const T *rinput1, const int input_channel,
const int input_height, const int input_width, const T *rinput2,
const int pad_size, const int kernel_size, const int max_displacement,
const int stride1, const int stride2) {
int p_input_width = input_width + 2 * pad_size;
int p_input_height = input_height + 2 * pad_size;
int kernel_rad = (kernel_size - 1) / 2;
int displacement_rad = max_displacement / stride2;
int displacement_size = 2 * displacement_rad + 1;
int n = blockIdx.x;
int h1 = blockIdx.y * stride1 + max_displacement;
int w1 = blockIdx.z * stride1 + max_displacement;
int c = threadIdx.x;
int p_dimchw = p_input_height * p_input_width * input_channel;
int p_dimcw = p_input_width * input_channel;
int p_dimc = input_channel;
int t_dimchw = output_channel * output_height * output_width;
int t_dimhw = output_height * output_width;
int t_dimw = output_width;
int nelems = kernel_size * kernel_size * p_dimc;
for (int tj = -displacement_rad; tj <= displacement_rad; ++tj) {
for (int ti = -displacement_rad; ti <= displacement_rad; ++ti) {
int w2 = w1 + ti * stride2;
int h2 = h1 + tj * stride2;
T acc0 = 0;
for (int j = -kernel_rad; j <= kernel_rad; ++j) {
for (int i = -kernel_rad; i <= kernel_rad; ++i) {
for (int ch = c; ch < p_dimc; ch += blockDim.x) {
int index1 =
n * p_dimchw + (h1 + j) * p_dimcw + (w1 + i) * p_dimc + ch;
int index2 =
n * p_dimchw + (h2 + j) * p_dimcw + (w2 + i) * p_dimc + ch;
acc0 += static_cast<T>(rinput1[index1] * rinput2[index2]);
}
}
}
if (blockDim.x == warpSize) {
__syncwarp();
acc0 = warpReduceSum(acc0);
} else {
__syncthreads();
acc0 = blockReduceSum(acc0);
}
if (threadIdx.x == 0) {
int tc = (tj + displacement_rad) * displacement_size +
(ti + displacement_rad);
const int t_index =
n * t_dimchw + tc * t_dimhw + blockIdx.y * t_dimw + blockIdx.z;
output[t_index] = static_cast<T>(acc0 / nelems);
}
}
}
}
// class CorrelationKernel<platform::CUDADeviceContext, T>
template <typename T>
class CorrelationCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::InvalidArgument(
"Correlation only supports GPU now."));
auto *input1 = ctx.Input<Tensor>("Input1");
auto *input2 = ctx.Input<Tensor>("Input2");
int pad_size = ctx.Attr<int>("pad_size");
int kernel_size = ctx.Attr<int>("kernel_size");
int stride1 = ctx.Attr<int>("stride1");
int stride2 = ctx.Attr<int>("stride2");
int max_displacement = ctx.Attr<int>("max_displacement");
int corr_type_multiply = ctx.Attr<int>("corr_type_multiply");
auto *output = ctx.Output<Tensor>("Output");
output->mutable_data<T>(ctx.GetPlace());
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
// base on input1, NCHW
auto in_dims = input1->dims();
int N = in_dims[0];
int C = in_dims[1];
int H = in_dims[2];
int W = in_dims[3];
int padded_input_height = H + 2 * pad_size;
int padded_input_width = W + 2 * pad_size;
Tensor rinput1 = ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>(
{N, padded_input_height, padded_input_width, C}, dev_ctx);
rinput1.mutable_data<T>(ctx.GetPlace());
Tensor rinput2 = ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>(
{N, padded_input_height, padded_input_width, C}, dev_ctx);
rinput2.mutable_data<T>(ctx.GetPlace());
set_zero<<<(rinput1.numel() + 512 - 1) / 512, 512, 0, dev_ctx.stream()>>>(
rinput1.data<T>(), rinput1.numel());
set_zero<<<(rinput2.numel() + 512 - 1) / 512, 512, 0, dev_ctx.stream()>>>(
rinput2.data<T>(), rinput2.numel());
set_zero<<<(output->numel() + 512 - 1) / 512, 512, 0, dev_ctx.stream()>>>(
output->data<T>(), output->numel());
auto out_dims = output->dims();
int OC = out_dims[1];
int OH = out_dims[2];
int OW = out_dims[3];
dim3 blocks_grid(N, H, W);
dim3 threads_block(THREADS_PER_BLOCK);
channel_first<T><<<blocks_grid, threads_block, 0, dev_ctx.stream()>>>(
input1->data<T>(), rinput1.data<T>(), C, H, W, pad_size);
channel_first<T><<<blocks_grid, threads_block, 0, dev_ctx.stream()>>>(
input2->data<T>(), rinput2.data<T>(), C, H, W, pad_size);
dim3 threadsPerBlock(THREADS_PER_BLOCK);
dim3 totalBlocksCorr(N, OH, OW);
correlation_forward<
T><<<totalBlocksCorr, threadsPerBlock, 0, dev_ctx.stream()>>>(
output->data<T>(), OC, OH, OW, rinput1.data<T>(), C, H, W,
rinput2.data<T>(), pad_size, kernel_size, max_displacement, stride1,
stride2);
}
};
template <typename T>
__global__ void correlation_backward_input1(
int item, T *grad_input1, const int input_channel, const int input_height,
const int input_width, const T *grad_output, const int output_channel,
const int output_height, const int output_width, const T *rinput2,
const int pad_size, const int kernel_size, const int max_displacement,
const int stride1, const int stride2) {
int n = item;
int h = blockIdx.x * stride1 + pad_size;
int w = blockIdx.y * stride1 + pad_size;
int c = blockIdx.z;
int tch_off = threadIdx.x;
int kernel_rad = (kernel_size - 1) / 2;
int displacement_rad = max_displacement / stride2;
int displacement_size = 2 * displacement_rad + 1;
int xmin = (w - kernel_rad - max_displacement) / stride1;
int ymin = (h - kernel_rad - max_displacement) / stride1;
int xmax = (w + kernel_rad - max_displacement) / stride1;
int ymax = (h + kernel_rad - max_displacement) / stride1;
if (xmax < 0 || ymax < 0 || xmin >= output_width || ymin >= output_height) {
return;
}
if (xmin > xmax || ymin > ymax) {
return;
}
xmin = max(0, xmin);
xmax = min(output_width - 1, xmax);
ymin = max(0, ymin);
ymax = min(output_height - 1, ymax);
int p_input_width = input_width + 2 * pad_size;
int p_input_height = input_height + 2 * pad_size;
int p_dimchw = input_channel * p_input_height * p_input_width;
int p_dimcw = input_channel * p_input_width;
int p_dimc = input_channel;
int t_dimchw = output_channel * output_height * output_width;
int t_dimhw = output_height * output_width;
int t_dimw = output_width;
int o_dimchw = input_channel * input_height * input_width;
int o_dimhw = input_height * input_width;
int o_dimw = input_width;
int nelems = kernel_size * kernel_size * input_channel;
__shared__ T prod_sum[THREADS_PER_BLOCK];
prod_sum[tch_off] = 0;
for (int tc = tch_off; tc < output_channel; tc += THREADS_PER_BLOCK) {
int i2 = (tc % displacement_size - displacement_rad) * stride2;
int j2 = (tc / displacement_size - displacement_rad) * stride2;
int index2 = n * p_dimchw + (h + j2) * p_dimcw + (w + i2) * p_dimc + c;
T val2 = rinput2[index2];
for (int j = ymin; j <= ymax; ++j) {
for (int i = xmin; i <= xmax; ++i) {
int t_index = n * t_dimchw + tc * t_dimhw + j * t_dimw + i;
prod_sum[tch_off] += grad_output[t_index] * val2;
}
}
}
__syncthreads();
if (tch_off == 0) {
T reduce_sum = 0;
for (int index = 0; index < THREADS_PER_BLOCK; index++) {
reduce_sum += prod_sum[index];
}
const int index1 =
n * o_dimchw + c * o_dimhw + (h - pad_size) * o_dimw + (w - pad_size);
grad_input1[index1] = static_cast<T>(reduce_sum / nelems);
}
}
template <typename T>
__global__ void correlation_backward_input2(
int item, T *grad_input2, const int input_channel, const int input_height,
const int input_width, const T *grad_output, const int output_channel,
const int output_height, const int output_width, const T *rinput1,
const int pad_size, const int kernel_size, const int max_displacement,
const int stride1, const int stride2) {
int n = item;
int h = blockIdx.x * stride1 + pad_size;
int w = blockIdx.y * stride1 + pad_size;
int c = blockIdx.z;
int tch_off = threadIdx.x;
int kernel_rad = (kernel_size - 1) / 2;
int displacement_rad = max_displacement / stride2;
int displacement_size = 2 * displacement_rad + 1;
int p_input_width = input_width + 2 * pad_size;
int p_input_height = input_height + 2 * pad_size;
int p_dimchw = input_channel * p_input_height * p_input_width;
int p_dimcw = input_channel * p_input_width;
int p_dimc = input_channel;
int t_dimchw = output_channel * output_height * output_width;
int t_dimhw = output_height * output_width;
int t_dimw = output_width;
int o_dimchw = input_channel * input_height * input_width;
int o_dimhw = input_height * input_width;
int o_dimw = input_width;
int nelems = kernel_size * kernel_size * input_channel;
__shared__ T prod_sum[THREADS_PER_BLOCK];
prod_sum[tch_off] = 0;
for (int tc = tch_off; tc < output_channel; tc += THREADS_PER_BLOCK) {
int i2 = (tc % displacement_size - displacement_rad) * stride2;
int j2 = (tc / displacement_size - displacement_rad) * stride2;
int xmin = (w - kernel_rad - max_displacement - i2) / stride1;
int ymin = (h - kernel_rad - max_displacement - j2) / stride1;
int xmax = (w + kernel_rad - max_displacement - i2) / stride1;
int ymax = (h + kernel_rad - max_displacement - j2) / stride1;
if (xmax < 0 || ymax < 0 || xmin >= output_width || ymin >= output_height) {
continue;
}
if (xmin > xmax || ymin > ymax) {
continue;
}
xmin = max(0, xmin);
xmax = min(output_width - 1, xmax);
ymin = max(0, ymin);
ymax = min(output_height - 1, ymax);
int index1 = n * p_dimchw + (h - j2) * p_dimcw + (w - i2) * p_dimc + c;
T val1 = rinput1[index1];
for (int j = ymin; j <= ymax; ++j) {
for (int i = xmin; i <= xmax; ++i) {
int t_index = n * t_dimchw + tc * t_dimhw + j * t_dimw + i;
prod_sum[tch_off] += grad_output[t_index] * val1;
}
}
}
__syncthreads();
if (tch_off == 0) {
T reduce_sum = 0;
for (int index = 0; index < THREADS_PER_BLOCK; index++) {
reduce_sum += prod_sum[index];
}
const int index2 =
n * o_dimchw + c * o_dimhw + (h - pad_size) * o_dimw + (w - pad_size);
grad_input2[index2] = static_cast<T>(reduce_sum / nelems);
}
}
template <typename T>
class CorrelationCUDAGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::InvalidArgument(
"Correlation only supports GPU now."));
const auto *input1 = ctx.Input<Tensor>("Input1");
const auto *input2 = ctx.Input<Tensor>("Input2");
const auto *grad_output =
ctx.Input<Tensor>(framework::GradVarName("Output"));
const int pad_size = ctx.Attr<int>("pad_size");
const int kernel_size = ctx.Attr<int>("kernel_size");
const int stride1 = ctx.Attr<int>("stride1");
const int stride2 = ctx.Attr<int>("stride2");
const int max_displacement = ctx.Attr<int>("max_displacement");
const int corr_type_multiply = ctx.Attr<int>("corr_type_multiply");
auto *grad_input1 = ctx.Output<Tensor>(framework::GradVarName("Input1"));
grad_input1->mutable_data<T>(ctx.GetPlace());
auto *grad_input2 = ctx.Output<Tensor>(framework::GradVarName("Input2"));
grad_input2->mutable_data<T>(ctx.GetPlace());
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto in_dims = input1->dims();
int N = in_dims[0];
int C = in_dims[1];
int H = in_dims[2];
int W = in_dims[3];
int padded_input_height = H + 2 * pad_size;
int padded_input_width = W + 2 * pad_size;
Tensor rinput1 = ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>(
{N, padded_input_height, padded_input_width, C}, dev_ctx);
rinput1.mutable_data<T>(ctx.GetPlace());
Tensor rinput2 = ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>(
{N, padded_input_height, padded_input_width, C}, dev_ctx);
rinput2.mutable_data<T>(ctx.GetPlace());
set_zero<<<(rinput1.numel() + 512 - 1) / 512, 512, 0, dev_ctx.stream()>>>(
rinput1.data<T>(), rinput1.numel());
set_zero<<<(rinput2.numel() + 512 - 1) / 512, 512, 0, dev_ctx.stream()>>>(
rinput2.data<T>(), rinput2.numel());
set_zero<<<(grad_input1->numel() + 512 - 1) / 512, 512, 0,
dev_ctx.stream()>>>(grad_input1->data<T>(),
grad_input1->numel());
set_zero<<<(grad_input2->numel() + 512 - 1) / 512, 512, 0,
dev_ctx.stream()>>>(grad_input2->data<T>(),
grad_input2->numel());
auto grad_out_dims = grad_output->dims();
int GOC = grad_out_dims[1];
int GOH = grad_out_dims[2];
int GOW = grad_out_dims[3];
dim3 blocks_grid(N, H, W);
dim3 threads_block(THREADS_PER_BLOCK);
channel_first<T><<<blocks_grid, threads_block, 0, dev_ctx.stream()>>>(
input1->data<T>(), rinput1.data<T>(), C, H, W, pad_size);
channel_first<T><<<blocks_grid, threads_block, 0, dev_ctx.stream()>>>(
input2->data<T>(), rinput2.data<T>(), C, H, W, pad_size);
dim3 threadsPerBlock(THREADS_PER_BLOCK);
dim3 totalBlocksCorr(H, W, C);
for (int n = 0; n < N; n++) {
correlation_backward_input1<
T><<<totalBlocksCorr, threadsPerBlock, 0, dev_ctx.stream()>>>(
n, grad_input1->data<T>(), C, H, W, grad_output->data<T>(), GOC, GOH,
GOW, rinput2.data<T>(), pad_size, kernel_size, max_displacement,
stride1, stride2);
}
for (int n = 0; n < N; n++) {
correlation_backward_input2<
T><<<totalBlocksCorr, threadsPerBlock, 0, dev_ctx.stream()>>>(
n, grad_input2->data<T>(), C, H, W, grad_output->data<T>(), GOC, GOH,
GOW, rinput1.data<T>(), pad_size, kernel_size, max_displacement,
stride1, stride2);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(correlation, ops::CorrelationCUDAKernel<float>,
ops::CorrelationCUDAKernel<double>);
REGISTER_OP_CUDA_KERNEL(correlation_grad, ops::CorrelationCUDAGradKernel<float>,
ops::CorrelationCUDAGradKernel<double>);
...@@ -41,9 +41,13 @@ detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_fo ...@@ -41,9 +41,13 @@ detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_fo
detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc) detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc)
if(WITH_GPU) if(WITH_GPU)
detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub) set(TMPDEPS memory)
detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS memory cub) if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc collect_fpn_proposals_op.cu DEPS memory cub) set(TMPDEPS memory cub)
endif()
detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS ${TMPDEPS})
detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS ${TMPDEPS})
detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc collect_fpn_proposals_op.cu DEPS ${TMPDEPS})
else() else()
detection_library(generate_proposals_op SRCS generate_proposals_op.cc) detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc) detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc)
......
...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and ...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License.*/ limitations under the License.*/
#include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h" #include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -54,11 +55,14 @@ class CollectFpnProposalsOp : public framework::OperatorWithKernel { ...@@ -54,11 +55,14 @@ class CollectFpnProposalsOp : public framework::OperatorWithKernel {
score_dim[1])); score_dim[1]));
} }
context->SetOutputDim("FpnRois", {post_nms_topN, 4}); context->SetOutputDim("FpnRois", {post_nms_topN, 4});
if (context->HasOutput("RoisNum")) {
context->SetOutputDim("RoisNum", {-1});
}
if (!context->IsRuntime()) { // Runtime LoD infershape will be computed if (!context->IsRuntime()) { // Runtime LoD infershape will be computed
// in Kernel. // in Kernel.
context->ShareLoD("MultiLevelRois", "FpnRois"); context->ShareLoD("MultiLevelRois", "FpnRois");
} }
if (context->IsRuntime()) { if (context->IsRuntime() && !context->HasInputs("MultiLevelRoIsNum")) {
std::vector<framework::InferShapeVarPtr> roi_inputs = std::vector<framework::InferShapeVarPtr> roi_inputs =
context->GetInputVarPtrs("MultiLevelRois"); context->GetInputVarPtrs("MultiLevelRois");
std::vector<framework::InferShapeVarPtr> score_inputs = std::vector<framework::InferShapeVarPtr> score_inputs =
...@@ -99,7 +103,16 @@ class CollectFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -99,7 +103,16 @@ class CollectFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
"(LoDTensor) Multiple score LoDTensors from each level in shape" "(LoDTensor) Multiple score LoDTensors from each level in shape"
" (N, 1), N is the number of RoIs.") " (N, 1), N is the number of RoIs.")
.AsDuplicable(); .AsDuplicable();
AddInput(
"MultiLevelRoIsNum",
"(List of Tensor) The RoIs' number of each image on multiple levels."
"The number on each level has the shape of (N), N is the number of "
"images.")
.AsDuplicable()
.AsDispensable();
AddOutput("FpnRois", "(LoDTensor) All selected RoIs with highest scores"); AddOutput("FpnRois", "(LoDTensor) All selected RoIs with highest scores");
AddOutput("RoisNum", "(Tensor), Number of RoIs in each images.")
.AsDispensable();
AddAttr<int>("post_nms_topN", AddAttr<int>("post_nms_topN",
"Select post_nms_topN RoIs from" "Select post_nms_topN RoIs from"
" all images and all fpn layers"); " all images and all fpn layers");
...@@ -123,3 +136,14 @@ REGISTER_OPERATOR( ...@@ -123,3 +136,14 @@ REGISTER_OPERATOR(
REGISTER_OP_CPU_KERNEL(collect_fpn_proposals, REGISTER_OP_CPU_KERNEL(collect_fpn_proposals,
ops::CollectFpnProposalsOpKernel<float>, ops::CollectFpnProposalsOpKernel<float>,
ops::CollectFpnProposalsOpKernel<double>); ops::CollectFpnProposalsOpKernel<double>);
REGISTER_OP_VERSION(collect_fpn_proposals)
.AddCheckpoint(
R"ROC(
Upgrade collect_fpn_proposals, add a new input
[MultiLevelRoIsNum] and a new output [RoisNum].)ROC",
paddle::framework::compatible::OpVersionDesc()
.NewInput("MultiLevelRoIsNum",
"The RoIs' number of each image on multiple levels."
"The number on each level has the shape of (N), "
"N is the number of images.")
.NewOutput("RoisNum", "The number of RoIs in each image."));
...@@ -80,14 +80,27 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> { ...@@ -80,14 +80,27 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
int lod_size; int lod_size;
auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace());
auto multi_rois_num = ctx.MultiInput<Tensor>("MultiLevelRoIsNum");
for (size_t i = 0; i < roi_ins.size(); ++i) { for (size_t i = 0; i < roi_ins.size(); ++i) {
auto roi_in = roi_ins[i]; auto roi_in = roi_ins[i];
auto score_in = score_ins[i]; auto score_in = score_ins[i];
auto roi_lod = roi_in->lod().back(); if (multi_rois_num.size() > 0) {
lod_size = roi_lod.size() - 1; framework::Tensor temp;
for (size_t n = 0; n < lod_size; ++n) { TensorCopySync(*multi_rois_num[i], platform::CPUPlace(), &temp);
for (size_t j = roi_lod[n]; j < roi_lod[n + 1]; ++j) { const int* length_in = temp.data<int>();
roi_batch_id_data[index++] = n; lod_size = multi_rois_num[i]->numel();
for (size_t n = 0; n < lod_size; ++n) {
for (size_t j = 0; j < length_in[n]; ++j) {
roi_batch_id_data[index++] = n;
}
}
} else {
auto length_in = roi_in->lod().back();
lod_size = length_in.size() - 1;
for (size_t n = 0; n < lod_size; ++n) {
for (size_t j = length_in[n]; j < length_in[n + 1]; ++j) {
roi_batch_id_data[index++] = n;
}
} }
} }
...@@ -190,6 +203,13 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> { ...@@ -190,6 +203,13 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
offset.emplace_back(offset.back() + length_lod_cpu[i]); offset.emplace_back(offset.back() + length_lod_cpu[i]);
} }
if (ctx.HasOutput("RoisNum")) {
auto* rois_num = ctx.Output<Tensor>("RoisNum");
int* rois_num_data = rois_num->mutable_data<int>({lod_size}, place);
memory::Copy(place, rois_num_data, place, length_lod_data,
lod_size * sizeof(int), dev_ctx.stream());
}
framework::LoD lod; framework::LoD lod;
lod.emplace_back(offset); lod.emplace_back(offset);
fpn_rois->set_lod(lod); fpn_rois->set_lod(lod);
......
...@@ -17,6 +17,7 @@ limitations under the License.*/ ...@@ -17,6 +17,7 @@ limitations under the License.*/
#include <algorithm> #include <algorithm>
#include <cmath> #include <cmath>
#include <cstring> #include <cstring>
#include <numeric>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -65,6 +66,8 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel<T> { ...@@ -65,6 +66,8 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel<T> {
auto multi_layer_scores = auto multi_layer_scores =
context.MultiInput<paddle::framework::LoDTensor>("MultiLevelScores"); context.MultiInput<paddle::framework::LoDTensor>("MultiLevelScores");
auto multi_rois_num = context.MultiInput<Tensor>("MultiLevelRoIsNum");
int num_size = multi_rois_num.size();
auto* fpn_rois = context.Output<paddle::framework::LoDTensor>("FpnRois"); auto* fpn_rois = context.Output<paddle::framework::LoDTensor>("FpnRois");
...@@ -88,11 +91,21 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel<T> { ...@@ -88,11 +91,21 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel<T> {
const int num_fpn_level = multi_layer_rois.size(); const int num_fpn_level = multi_layer_rois.size();
std::vector<int> integral_of_all_rois(num_fpn_level + 1, 0); std::vector<int> integral_of_all_rois(num_fpn_level + 1, 0);
for (int i = 0; i < num_fpn_level; ++i) { for (int i = 0; i < num_fpn_level; ++i) {
auto cur_rois_lod = multi_layer_rois[i]->lod().back(); int all_rois = 0;
integral_of_all_rois[i + 1] = if (num_size == 0) {
integral_of_all_rois[i] + cur_rois_lod[cur_rois_lod.size() - 1]; auto cur_rois_lod = multi_layer_rois[i]->lod().back();
all_rois = cur_rois_lod[cur_rois_lod.size() - 1];
} else {
const int* cur_rois_num = multi_rois_num[i]->data<int>();
all_rois = std::accumulate(
cur_rois_num, cur_rois_num + multi_rois_num[i]->numel(), 0);
}
integral_of_all_rois[i + 1] = integral_of_all_rois[i] + all_rois;
} }
const int batch_size = (num_size == 0)
? multi_layer_rois[0]->lod().back().size() - 1
: multi_rois_num[0]->numel();
// concatenate all fpn rois scores into a list // concatenate all fpn rois scores into a list
// create a vector to store all scores // create a vector to store all scores
std::vector<ScoreWithID<T>> scores_of_all_rois( std::vector<ScoreWithID<T>> scores_of_all_rois(
...@@ -100,11 +113,20 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel<T> { ...@@ -100,11 +113,20 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel<T> {
for (int i = 0; i < num_fpn_level; ++i) { for (int i = 0; i < num_fpn_level; ++i) {
const T* cur_level_scores = multi_layer_scores[i]->data<T>(); const T* cur_level_scores = multi_layer_scores[i]->data<T>();
int cur_level_num = integral_of_all_rois[i + 1] - integral_of_all_rois[i]; int cur_level_num = integral_of_all_rois[i + 1] - integral_of_all_rois[i];
auto cur_scores_lod = multi_layer_scores[i]->lod().back();
int cur_batch_id = 0; int cur_batch_id = 0;
int pre_num = 0;
for (int j = 0; j < cur_level_num; ++j) { for (int j = 0; j < cur_level_num; ++j) {
if (static_cast<size_t>(j) >= cur_scores_lod[cur_batch_id + 1]) { if (num_size == 0) {
cur_batch_id++; auto cur_scores_lod = multi_layer_scores[i]->lod().back();
if (static_cast<size_t>(j) >= cur_scores_lod[cur_batch_id + 1]) {
cur_batch_id++;
}
} else {
const int* rois_num_data = multi_rois_num[i]->data<int>();
if (j >= pre_num + rois_num_data[cur_batch_id]) {
pre_num += rois_num_data[cur_batch_id];
cur_batch_id++;
}
} }
int cur_index = j + integral_of_all_rois[i]; int cur_index = j + integral_of_all_rois[i];
scores_of_all_rois[cur_index].score = cur_level_scores[j]; scores_of_all_rois[cur_index].score = cur_level_scores[j];
...@@ -134,6 +156,9 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel<T> { ...@@ -134,6 +156,9 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel<T> {
T* fpn_rois_data = fpn_rois->data<T>(); T* fpn_rois_data = fpn_rois->data<T>();
std::vector<size_t> lod0(1, 0); std::vector<size_t> lod0(1, 0);
int cur_batch_id = 0; int cur_batch_id = 0;
std::vector<int64_t> num_per_batch;
int pre_idx = 0;
int cur_num = 0;
for (int i = 0; i < post_nms_topN; ++i) { for (int i = 0; i < post_nms_topN; ++i) {
int cur_fpn_level = scores_of_all_rois[i].level; int cur_fpn_level = scores_of_all_rois[i].level;
int cur_level_index = scores_of_all_rois[i].index; int cur_level_index = scores_of_all_rois[i].index;
...@@ -144,6 +169,18 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel<T> { ...@@ -144,6 +169,18 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel<T> {
if (scores_of_all_rois[i].batch_id != cur_batch_id) { if (scores_of_all_rois[i].batch_id != cur_batch_id) {
cur_batch_id = scores_of_all_rois[i].batch_id; cur_batch_id = scores_of_all_rois[i].batch_id;
lod0.emplace_back(i); lod0.emplace_back(i);
cur_num = i - pre_idx;
pre_idx = i;
num_per_batch.emplace_back(cur_num);
}
}
num_per_batch.emplace_back(post_nms_topN - pre_idx);
if (context.HasOutput("RoisNum")) {
auto* rois_num = context.Output<Tensor>("RoisNum");
int* rois_num_data =
rois_num->mutable_data<int>({batch_size}, context.GetPlace());
for (int i = 0; i < batch_size; i++) {
rois_num_data[i] = num_per_batch[i];
} }
} }
lod0.emplace_back(post_nms_topN); lod0.emplace_back(post_nms_topN);
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h" #include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -48,6 +49,14 @@ class DistributeFpnProposalsOp : public framework::OperatorWithKernel { ...@@ -48,6 +49,14 @@ class DistributeFpnProposalsOp : public framework::OperatorWithKernel {
} }
ctx->SetOutputsDim("MultiFpnRois", outs_dims); ctx->SetOutputsDim("MultiFpnRois", outs_dims);
ctx->SetOutputDim("RestoreIndex", {-1, 1}); ctx->SetOutputDim("RestoreIndex", {-1, 1});
if (ctx->HasOutputs("MultiLevelRoIsNum")) {
std::vector<framework::DDim> outs_num_dims;
for (size_t i = 0; i < num_out_rois; ++i) {
outs_num_dims.push_back({-1});
}
ctx->SetOutputsDim("MultiLevelRoIsNum", outs_num_dims);
}
if (!ctx->IsRuntime()) { if (!ctx->IsRuntime()) {
for (size_t i = 0; i < num_out_rois; ++i) { for (size_t i = 0; i < num_out_rois; ++i) {
ctx->SetLoDLevel("MultiFpnRois", ctx->GetLoDLevel("FpnRois"), i); ctx->SetLoDLevel("MultiFpnRois", ctx->GetLoDLevel("FpnRois"), i);
...@@ -66,12 +75,22 @@ class DistributeFpnProposalsOp : public framework::OperatorWithKernel { ...@@ -66,12 +75,22 @@ class DistributeFpnProposalsOp : public framework::OperatorWithKernel {
class DistributeFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker { class DistributeFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddInput("FpnRois", "(LoDTensor) The rois at all levels in shape (-1, 4)"); AddInput("FpnRois", "(LoDTensor) The RoIs at all levels in shape (-1, 4)");
AddInput("RoisNum",
"(Tensor) The number of RoIs in shape (B),"
"B is the number of images")
.AsDispensable();
AddOutput("MultiFpnRois", "(LoDTensor) Output with distribute operator") AddOutput("MultiFpnRois", "(LoDTensor) Output with distribute operator")
.AsDuplicable(); .AsDuplicable();
AddOutput("RestoreIndex", AddOutput("RestoreIndex",
"(Tensor) An array of positive number which is " "(Tensor) An array of positive number which is "
"used to restore the order of FpnRois"); "used to restore the order of FpnRois");
AddOutput("MultiLevelRoIsNum",
"(List of Tensor) The RoIs' number of each image on multiple "
"levels. The number on each level has the shape of (B),"
"B is the number of images.")
.AsDuplicable()
.AsDispensable();
AddAttr<int>("min_level", AddAttr<int>("min_level",
"The lowest level of FPN layer where the" "The lowest level of FPN layer where the"
" proposals come from"); " proposals come from");
...@@ -105,3 +124,14 @@ REGISTER_OPERATOR( ...@@ -105,3 +124,14 @@ REGISTER_OPERATOR(
REGISTER_OP_CPU_KERNEL(distribute_fpn_proposals, REGISTER_OP_CPU_KERNEL(distribute_fpn_proposals,
ops::DistributeFpnProposalsOpKernel<float>, ops::DistributeFpnProposalsOpKernel<float>,
ops::DistributeFpnProposalsOpKernel<double>); ops::DistributeFpnProposalsOpKernel<double>);
REGISTER_OP_VERSION(distribute_fpn_proposals)
.AddCheckpoint(
R"ROC(
Upgrade distribute_fpn_proposals, add a new input
[RoisNum] and a new output [MultiLevelRoIsNum].)ROC",
paddle::framework::compatible::OpVersionDesc()
.NewInput("RoIsNum", "The number of RoIs in each image.")
.NewOutput("MultiLevelRoisNum",
"The RoIs' number of each image on multiple "
"levels. The number on each level has the shape of (B),"
"B is the number of images."));
...@@ -76,12 +76,20 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> { ...@@ -76,12 +76,20 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
int num_level = max_level - min_level + 1; int num_level = max_level - min_level + 1;
// check that the fpn_rois is not empty // check that the fpn_rois is not empty
PADDLE_ENFORCE_EQ( if (!ctx.HasInput("RoisNum")) {
fpn_rois->lod().size(), 1UL, PADDLE_ENFORCE_EQ(
platform::errors::InvalidArgument("DistributeFpnProposalsOp needs LoD" fpn_rois->lod().size(), 1UL,
"with one level")); platform::errors::InvalidArgument("DistributeFpnProposalsOp needs LoD"
"with one level"));
}
auto fpn_rois_lod = fpn_rois->lod().back(); std::vector<size_t> fpn_rois_lod;
if (ctx.HasInput("RoisNum")) {
auto* rois_num = ctx.Input<Tensor>("RoisNum");
fpn_rois_lod = GetLodFromRoisNum(rois_num);
} else {
fpn_rois_lod = fpn_rois->lod().back();
}
int lod_size = fpn_rois_lod.size() - 1; int lod_size = fpn_rois_lod.size() - 1;
int roi_num = fpn_rois_lod[lod_size]; int roi_num = fpn_rois_lod[lod_size];
...@@ -154,6 +162,8 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> { ...@@ -154,6 +162,8 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
restore_idx_data, roi_num); restore_idx_data, roi_num);
int start = 0; int start = 0;
auto multi_rois_num = ctx.MultiOutput<Tensor>("MultiLevelRoIsNum");
for (int i = 0; i < num_level; ++i) { for (int i = 0; i < num_level; ++i) {
Tensor sub_lod = sub_lod_list.Slice(i, i + 1); Tensor sub_lod = sub_lod_list.Slice(i, i + 1);
int* sub_lod_data = sub_lod.data<int>(); int* sub_lod_data = sub_lod.data<int>();
...@@ -180,6 +190,11 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> { ...@@ -180,6 +190,11 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
multi_fpn_rois[i]->mutable_data<T>({sub_rois_num, kBoxDim}, multi_fpn_rois[i]->mutable_data<T>({sub_rois_num, kBoxDim},
dev_ctx.GetPlace()); dev_ctx.GetPlace());
} }
if (multi_rois_num.size() > 0) {
Tensor* rois_num_t = multi_rois_num[i];
TensorCopySync(sub_lod, dev_ctx.GetPlace(), rois_num_t);
rois_num_t->Resize({lod_size});
}
framework::LoD lod; framework::LoD lod;
lod.emplace_back(offset); lod.emplace_back(offset);
multi_fpn_rois[i]->set_lod(lod); multi_fpn_rois[i]->set_lod(lod);
......
...@@ -28,6 +28,21 @@ namespace operators { ...@@ -28,6 +28,21 @@ namespace operators {
const int kBoxDim = 4; const int kBoxDim = 4;
inline std::vector<size_t> GetLodFromRoisNum(const Tensor* rois_num) {
std::vector<size_t> rois_lod;
auto* rois_num_data = rois_num->data<int>();
Tensor cpu_tensor;
if (platform::is_gpu_place(rois_num->place())) {
TensorCopySync(*rois_num, platform::CPUPlace(), &cpu_tensor);
rois_num_data = cpu_tensor.data<int>();
}
rois_lod.push_back(static_cast<size_t>(0));
for (int i = 0; i < rois_num->numel(); ++i) {
rois_lod.push_back(rois_lod.back() + static_cast<size_t>(rois_num_data[i]));
}
return rois_lod;
}
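A minimal standalone sketch of what GetLodFromRoisNum above computes, assuming per-image RoI counts of 3 and 2 (not Paddle code; CountsToLod is a hypothetical stand-in):

#include <cassert>
#include <vector>

// Per-image RoI counts are accumulated into LoD offsets, mirroring
// GetLodFromRoisNum above.
std::vector<size_t> CountsToLod(const std::vector<int>& rois_num) {
  std::vector<size_t> lod{0};
  for (int n : rois_num) lod.push_back(lod.back() + static_cast<size_t>(n));
  return lod;
}

int main() {
  // Two images with 3 and 2 RoIs give the offsets {0, 3, 5}.
  assert((CountsToLod({3, 2}) == std::vector<size_t>{0, 3, 5}));
  return 0;
}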
template <typename T> template <typename T>
static inline T BBoxArea(const T* box, bool normalized) { static inline T BBoxArea(const T* box, bool normalized) {
if (box[2] < box[0] || box[3] < box[1]) { if (box[2] < box[0] || box[3] < box[1]) {
...@@ -65,13 +80,22 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel<T> { ...@@ -65,13 +80,22 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
const int num_level = max_level - min_level + 1; const int num_level = max_level - min_level + 1;
// check that the fpn_rois is not empty // check that the fpn_rois is not empty
PADDLE_ENFORCE_EQ( if (!context.HasInput("RoisNum")) {
fpn_rois->lod().size(), 1UL, PADDLE_ENFORCE_EQ(fpn_rois->lod().size(), 1UL,
platform::errors::InvalidArgument("DistributeFpnProposalsOp needs LoD " platform::errors::InvalidArgument(
"with one level.")); "DistributeFpnProposalsOp needs LoD "
"with one level."));
}
auto fpn_rois_lod = fpn_rois->lod().back(); std::vector<size_t> fpn_rois_lod;
int fpn_rois_num = fpn_rois_lod[fpn_rois_lod.size() - 1]; int fpn_rois_num;
if (context.HasInput("RoisNum")) {
auto* rois_num = context.Input<Tensor>("RoisNum");
fpn_rois_lod = GetLodFromRoisNum(rois_num);
} else {
fpn_rois_lod = fpn_rois->lod().back();
}
fpn_rois_num = fpn_rois_lod[fpn_rois_lod.size() - 1];
std::vector<int> target_level; std::vector<int> target_level;
// std::vector<int> target_level(fpn_rois_num, -1); // std::vector<int> target_level(fpn_rois_num, -1);
// record the number of rois in each level // record the number of rois in each level
...@@ -136,6 +160,18 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel<T> { ...@@ -136,6 +160,18 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
for (int i = 0; i < fpn_rois_num; ++i) { for (int i = 0; i < fpn_rois_num; ++i) {
restore_index_data[restore_index_inter[i]] = i; restore_index_data[restore_index_inter[i]] = i;
} }
auto multi_rois_num = context.MultiOutput<Tensor>("MultiLevelRoIsNum");
if (multi_rois_num.size() > 0) {
int batch_size = fpn_rois_lod.size() - 1;
for (int i = 0; i < num_level; ++i) {
int* rois_num_data = multi_rois_num[i]->mutable_data<int>(
{batch_size}, context.GetPlace());
for (int j = 0; j < batch_size; ++j) {
rois_num_data[j] = static_cast<int>(multi_fpn_rois_lod0[i][j + 1] -
multi_fpn_rois_lod0[i][j]);
}
}
}
// merge lod information into LoDTensor // merge lod information into LoDTensor
for (int i = 0; i < num_level; ++i) { for (int i = 0; i < num_level; ++i) {
framework::LoD lod; framework::LoD lod;
......
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
...@@ -61,6 +62,10 @@ class GenerateProposalsOp : public framework::OperatorWithKernel { ...@@ -61,6 +62,10 @@ class GenerateProposalsOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("RpnRois", {-1, 4}); ctx->SetOutputDim("RpnRois", {-1, 4});
ctx->SetOutputDim("RpnRoiProbs", {-1, 1}); ctx->SetOutputDim("RpnRoiProbs", {-1, 1});
if (!ctx->IsRuntime()) {
ctx->SetLoDLevel("RpnRois", std::max(ctx->GetLoDLevel("Scores"), 1));
ctx->SetLoDLevel("RpnRoiProbs", std::max(ctx->GetLoDLevel("Scores"), 1));
}
} }
protected: protected:
...@@ -347,7 +352,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> { ...@@ -347,7 +352,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
lod0.push_back(0); lod0.push_back(0);
anchors.Resize({anchors.numel() / 4, 4}); anchors.Resize({anchors.numel() / 4, 4});
variances.Resize({variances.numel() / 4, 4}); variances.Resize({variances.numel() / 4, 4});
std::vector<int64_t> tmp_lod; std::vector<int> tmp_num;
int64_t num_proposals = 0; int64_t num_proposals = 0;
for (int64_t i = 0; i < num; ++i) { for (int64_t i = 0; i < num; ++i) {
...@@ -369,16 +374,16 @@ class GenerateProposalsKernel : public framework::OpKernel<T> { ...@@ -369,16 +374,16 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
AppendProposals(rpn_roi_probs, num_proposals, scores); AppendProposals(rpn_roi_probs, num_proposals, scores);
num_proposals += proposals.dims()[0]; num_proposals += proposals.dims()[0];
lod0.push_back(num_proposals); lod0.push_back(num_proposals);
tmp_lod.push_back(num_proposals); tmp_num.push_back(proposals.dims()[0]);
} }
if (context.HasOutput("RpnRoisLod")) { if (context.HasOutput("RpnRoisNum")) {
auto *rpn_rois_lod = context.Output<Tensor>("RpnRoisLod"); auto *rpn_rois_num = context.Output<Tensor>("RpnRoisNum");
rpn_rois_lod->mutable_data<int64_t>({num}, context.GetPlace()); rpn_rois_num->mutable_data<int>({num}, context.GetPlace());
int64_t *lod_data = rpn_rois_lod->data<int64_t>(); int *num_data = rpn_rois_num->data<int>();
for (int i = 0; i < num; i++) { for (int i = 0; i < num; i++) {
lod_data[i] = tmp_lod[i]; num_data[i] = tmp_num[i];
} }
rpn_rois_lod->Resize({num}); rpn_rois_num->Resize({num});
} }
rpn_rois->set_lod(lod); rpn_rois->set_lod(lod);
rpn_roi_probs->set_lod(lod); rpn_roi_probs->set_lod(lod);
...@@ -433,6 +438,16 @@ class GenerateProposalsKernel : public framework::OpKernel<T> { ...@@ -433,6 +438,16 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
Tensor keep; Tensor keep;
FilterBoxes<T>(ctx, &proposals, min_size, im_info_slice, &keep); FilterBoxes<T>(ctx, &proposals, min_size, im_info_slice, &keep);
// Handle the case when there is no keep index left
if (keep.numel() == 0) {
math::SetConstant<platform::CPUDeviceContext, T> set_zero;
bbox_sel.mutable_data<T>({1, 4}, ctx.GetPlace());
set_zero(ctx, &bbox_sel, static_cast<T>(0));
Tensor scores_filter;
scores_filter.mutable_data<T>({1, 1}, ctx.GetPlace());
set_zero(ctx, &scores_filter, static_cast<T>(0));
return std::make_pair(bbox_sel, scores_filter);
}
Tensor scores_filter; Tensor scores_filter;
bbox_sel.mutable_data<T>({keep.numel(), 4}, ctx.GetPlace()); bbox_sel.mutable_data<T>({keep.numel(), 4}, ctx.GetPlace());
...@@ -481,7 +496,8 @@ class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -481,7 +496,8 @@ class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
"(LoDTensor), Output proposals with shape (rois_num, 4)."); "(LoDTensor), Output proposals with shape (rois_num, 4).");
AddOutput("RpnRoiProbs", AddOutput("RpnRoiProbs",
"(LoDTensor) Scores of proposals with shape (rois_num, 1)."); "(LoDTensor) Scores of proposals with shape (rois_num, 1).");
AddOutput("RpnRoisLod", "(Tensor), rpn rois's lod info").AsDispensable(); AddOutput("RpnRoisNum", "(Tensor), The number of Rpn RoIs in each image")
.AsDispensable();
AddAttr<int>("pre_nms_topN", AddAttr<int>("pre_nms_topN",
"Number of top scoring RPN proposals to keep before " "Number of top scoring RPN proposals to keep before "
"applying NMS."); "applying NMS.");
...@@ -515,3 +531,11 @@ REGISTER_OPERATOR( ...@@ -515,3 +531,11 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>); paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(generate_proposals, ops::GenerateProposalsKernel<float>, REGISTER_OP_CPU_KERNEL(generate_proposals, ops::GenerateProposalsKernel<float>,
ops::GenerateProposalsKernel<double>); ops::GenerateProposalsKernel<double>);
REGISTER_OP_VERSION(generate_proposals)
.AddCheckpoint(
R"ROC(
Upgrade generate_proposals, add a new output [RpnRoisNum].)ROC",
paddle::framework::compatible::OpVersionDesc().NewOutput(
"RpnRoisNum",
"The number of Rpn RoIs in each image. RpnRoisNum is "
"dispensable."));
...@@ -330,6 +330,15 @@ static std::pair<Tensor, Tensor> ProposalForOneImage( ...@@ -330,6 +330,15 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
keep_index.Resize({keep_num}); keep_index.Resize({keep_num});
Tensor scores_filter, proposals_filter; Tensor scores_filter, proposals_filter;
// Handle the case when there is no keep index left
if (keep_num == 0) {
math::SetConstant<platform::CUDADeviceContext, T> set_zero;
proposals_filter.mutable_data<T>({1, 4}, ctx.GetPlace());
scores_filter.mutable_data<T>({1, 1}, ctx.GetPlace());
set_zero(ctx, &proposals_filter, static_cast<T>(0));
set_zero(ctx, &scores_filter, static_cast<T>(0));
return std::make_pair(proposals_filter, scores_filter);
}
proposals_filter.mutable_data<T>({keep_num, 4}, ctx.GetPlace()); proposals_filter.mutable_data<T>({keep_num, 4}, ctx.GetPlace());
scores_filter.mutable_data<T>({keep_num, 1}, ctx.GetPlace()); scores_filter.mutable_data<T>({keep_num, 1}, ctx.GetPlace());
GPUGather<T>(ctx, proposals, keep_index, &proposals_filter); GPUGather<T>(ctx, proposals, keep_index, &proposals_filter);
...@@ -421,7 +430,7 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> { ...@@ -421,7 +430,7 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
int64_t num_proposals = 0; int64_t num_proposals = 0;
std::vector<size_t> offset(1, 0); std::vector<size_t> offset(1, 0);
std::vector<int64_t> tmp_lod; std::vector<int> tmp_num;
for (int64_t i = 0; i < num; ++i) { for (int64_t i = 0; i < num; ++i) {
Tensor im_info_slice = im_info->Slice(i, i + 1); Tensor im_info_slice = im_info->Slice(i, i + 1);
...@@ -448,15 +457,15 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> { ...@@ -448,15 +457,15 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
dev_ctx.Wait(); dev_ctx.Wait();
num_proposals += proposals.dims()[0]; num_proposals += proposals.dims()[0];
offset.emplace_back(num_proposals); offset.emplace_back(num_proposals);
tmp_lod.push_back(num_proposals); tmp_num.push_back(proposals.dims()[0]);
} }
if (context.HasOutput("RpnRoisLod")) { if (context.HasOutput("RpnRoisNum")) {
auto *rpn_rois_lod = context.Output<Tensor>("RpnRoisLod"); auto *rpn_rois_num = context.Output<Tensor>("RpnRoisNum");
rpn_rois_lod->mutable_data<int64_t>({num}, context.GetPlace()); rpn_rois_num->mutable_data<int>({num}, context.GetPlace());
int64_t *lod_data = rpn_rois_lod->data<int64_t>(); int *num_data = rpn_rois_num->data<int>();
memory::Copy(place, lod_data, cpu_place, &tmp_lod[0], memory::Copy(place, num_data, cpu_place, &tmp_num[0], sizeof(int) * num,
sizeof(int64_t) * num, dev_ctx.stream()); dev_ctx.stream());
rpn_rois_lod->Resize({num}); rpn_rois_num->Resize({num});
} }
framework::LoD lod; framework::LoD lod;
lod.emplace_back(offset); lod.emplace_back(offset);
......
...@@ -176,14 +176,26 @@ static void DistGradFunction(const framework::ExecutionContext& context) { ...@@ -176,14 +176,26 @@ static void DistGradFunction(const framework::ExecutionContext& context) {
} else if (p == INFINITY || p == -INFINITY) { } else if (p == INFINITY || p == -INFINITY) {
// p=inf or -inf, Lp-norm = |z_i|, the j-th element of dz tends to 0 if // p=inf or -inf, Lp-norm = |z_i|, the j-th element of dz tends to 0 if
// j!=i, or equals to sign(z_i) * dout if j=i. // j!=i, or equals to sign(z_i) * dout if j=i.
grad_t.device(place) = if (platform::is_cpu_place(context.GetPlace())) {
(x_minux_y_abs == out_t.broadcast(out_bcast_dims)).template cast<T>() * grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims))
sign * out_grad_t.broadcast(out_bcast_dims); .template cast<T>() *
sign.eval() * out_grad_t.broadcast(out_bcast_dims);
} else {
grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims))
.template cast<T>() *
sign * out_grad_t.broadcast(out_bcast_dims);
}
} else { } else {
// dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout // dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout
grad_t.device(place) = if (platform::is_cpu_place(context.GetPlace())) {
(x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * sign * grad_t.device(place) =
out_grad_t.broadcast(out_bcast_dims); (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) *
sign.eval() * out_grad_t.broadcast(out_bcast_dims);
} else {
grad_t.device(place) =
(x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * sign *
out_grad_t.broadcast(out_bcast_dims);
}
} }
Eigen::DSizes<int, Rank * 2> x_reshape_dims; Eigen::DSizes<int, Rank * 2> x_reshape_dims;
......
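The CPU branches above materialize `sign` with `.eval()` before the broadcasted multiply. A minimal standalone sketch of what `.eval()` does to an Eigen expression, assuming Eigen >= 3.3 for Array::sign() (not the kernel above):

#include <Eigen/Dense>
#include <cassert>

int main() {
  Eigen::ArrayXf x(3), y(3);
  x << 1.f, -2.f, 3.f;
  y << 0.f, 5.f, 3.f;

  auto lazy = (x - y).sign();                     // expression, re-evaluated on each read
  Eigen::ArrayXf stored = (x - y).sign().eval();  // materialized once: {1, -1, 0}

  x(0) = -10.f;              // mutate an operand afterwards
  assert(lazy(0) == -1.f);   // the lazy view sees the new operand value
  assert(stored(0) == 1.f);  // the eval()'d copy keeps the old result
  return 0;
}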
...@@ -49,8 +49,6 @@ REGISTER_OP_WITHOUT_GRADIENT(elementwise_floordiv, ops::ElementwiseOp, ...@@ -49,8 +49,6 @@ REGISTER_OP_WITHOUT_GRADIENT(elementwise_floordiv, ops::ElementwiseOp,
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
elementwise_floordiv, elementwise_floordiv,
ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, float>,
ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, double>,
ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, int>, ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, int>,
ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext,
int64_t>); int64_t>);
...@@ -19,7 +19,5 @@ namespace plat = paddle::platform; ...@@ -19,7 +19,5 @@ namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_floordiv, elementwise_floordiv,
ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, float>,
ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, double>,
ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int>, ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int>,
ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int64_t>); ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int64_t>);
...@@ -14,7 +14,6 @@ limitations under the License. */ ...@@ -14,7 +14,6 @@ limitations under the License. */
#pragma once #pragma once
#include <math.h>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
...@@ -62,15 +61,8 @@ void elementwise_floor_div(const framework::ExecutionContext &ctx, ...@@ -62,15 +61,8 @@ void elementwise_floor_div(const framework::ExecutionContext &ctx,
const framework::Tensor *x, const framework::Tensor *x,
const framework::Tensor *y, framework::Tensor *z) { const framework::Tensor *y, framework::Tensor *z) {
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
auto x_dims = x->dims(); ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>(
auto y_dims = y->dims(); ctx, x, y, axis, FloorDivFunctor<T>(), z);
if (x_dims.size() >= y_dims.size()) {
ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>(
ctx, x, y, axis, FloorDivFunctor<T>(), z);
} else {
ElementwiseComputeEx<InverseFloorDivFunctor<T>, DeviceContext, T>(
ctx, x, y, axis, InverseFloorDivFunctor<T>(), z);
}
} }
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
......
...@@ -33,22 +33,7 @@ class ElementwiseMulOp : public ElementwiseOp { ...@@ -33,22 +33,7 @@ class ElementwiseMulOp : public ElementwiseOp {
auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
using mkldnn::memory; if (platform::CanMKLDNNBeUsed(ctx)) {
auto CanMKLDNNElementwiseMulBeUsed = [&]() {
auto x_dims = ctx.Input<Tensor>("X")->dims();
auto y_dims = ctx.Input<Tensor>("Y")->dims();
int rankdiff = x_dims.size() - y_dims.size();
// TODO(jczaja): Remove this when oneDNN performance for scalar
// broadcasting
// is improved (Ernie large situation)
if (rankdiff != 0 && y_dims.size() == 1 && y_dims[0] == 1) {
return false;
}
return true;
};
if (platform::CanMKLDNNBeUsed(ctx) && CanMKLDNNElementwiseMulBeUsed()) {
return framework::OpKernelType(input_data_type, ctx.GetPlace(), return framework::OpKernelType(input_data_type, ctx.GetPlace(),
framework::DataLayout::kMKLDNN, framework::DataLayout::kMKLDNN,
framework::LibraryType::kMKLDNN); framework::LibraryType::kMKLDNN);
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/empty_op.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
class EmptyOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("ShapeTensor",
"(Tensor<int>), optional). The shape of the output."
"It has a higher priority than Attr(shape).")
.AsDispensable();
AddInput("ShapeTensorList",
"(vector<Tensor<int>>, optional). The shape of the output. "
"It has a higher priority than Attr(shape)."
"The shape of the element in vector must be [1].")
.AsDuplicable()
.AsDispensable();
AddAttr<std::vector<int64_t>>("shape",
"(vector<int64_t>) The shape of the output")
.SetDefault({});
AddAttr<int>("dtype", "The data type of output tensor, Default is float")
.SetDefault(framework::proto::VarType::FP32);
AddOutput("Out", "(Tensor) The output tensor.");
AddComment(R"DOC(empty operator
Returns a tensor filled with uninitialized data. The shape of the tensor is
defined by the variable argument shape.
The data type of the tensor is specified by `dtype`.
)DOC");
}
};
class EmptyOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* context) const override {
OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "empty");
if (context->HasInput("ShapeTensor")) {
auto dims = context->GetInputDim("ShapeTensor");
int num_ele = 1;
for (int i = 0; i < dims.size(); ++i) {
num_ele *= dims[i];
}
context->SetOutputDim("Out", framework::make_ddim({num_ele}));
} else if (context->HasInputs("ShapeTensorList")) {
std::vector<int> out_dims;
auto dims_list = context->GetInputsDim("ShapeTensorList");
for (size_t i = 0; i < dims_list.size(); ++i) {
auto& dims = dims_list[i];
PADDLE_ENFORCE_EQ(
dims, framework::make_ddim({1}),
platform::errors::InvalidArgument(
"The shape of Tensor in list must be [1]. "
"But received the shape is [%s].",
dims));
out_dims.push_back(dims[0]);
}
context->SetOutputDim("Out", framework::make_ddim(out_dims));
} else {
auto& shape = context->Attrs().Get<std::vector<int64_t>>("shape");
context->SetOutputDim("Out", framework::make_ddim(shape));
}
}
protected:
framework::OpKernelType GetKernelTypeForVar(
const std::string& var_name, const framework::Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const override {
if (var_name == "ShapeTensor" || var_name == "ShapeTensorList") {
return expected_kernel_type;
} else {
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(), tensor.layout());
}
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& context) const override {
return framework::OpKernelType(
framework::proto::VarType::Type(context.Attr<int>("dtype")),
context.GetPlace());
}
};
class EmptyOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(framework::InferVarTypeContext* context) const override {
auto data_type = static_cast<framework::proto::VarType::Type>(
BOOST_GET_CONST(int, context->GetAttr("dtype")));
context->SetOutputDataType("Out", data_type);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OPERATOR(
empty, ops::EmptyOp, ops::EmptyOpMaker, ops::EmptyOpVarTypeInference,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(empty, ops::EmptyKernel<plat::CPUDeviceContext, bool>,
ops::EmptyKernel<plat::CPUDeviceContext, int>,
ops::EmptyKernel<plat::CPUDeviceContext, int64_t>,
ops::EmptyKernel<plat::CPUDeviceContext, float>,
ops::EmptyKernel<plat::CPUDeviceContext, double>,
ops::EmptyKernel<plat::CPUDeviceContext, plat::float16>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/empty_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
empty, ops::EmptyKernel<plat::CUDADeviceContext, bool>,
ops::EmptyKernel<plat::CUDADeviceContext, int>,
ops::EmptyKernel<plat::CUDADeviceContext, int64_t>,
ops::EmptyKernel<plat::CUDADeviceContext, float>,
ops::EmptyKernel<plat::CUDADeviceContext, double>,
ops::EmptyKernel<plat::CUDADeviceContext, plat::float16>);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/utils.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class EmptyKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto dtype = static_cast<framework::proto::VarType::Type>(
context.Attr<int>("dtype"));
Tensor *out_tensor = context.Output<Tensor>("Out");
auto shape = GetShape(context);
out_tensor->Resize(shape);
out_tensor->mutable_data(context.GetPlace(), dtype);
}
};
} // namespace operators
} // namespace paddle
...@@ -228,6 +228,26 @@ class ExpandGradOpMaker : public framework::SingleGradOpMaker<T> { ...@@ -228,6 +228,26 @@ class ExpandGradOpMaker : public framework::SingleGradOpMaker<T> {
} }
}; };
template <typename T>
class ExpandDoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetInput("X", this->OutputGrad(framework::GradVarName("X")));
op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out")));
if (this->HasInput("expand_times_tensor")) {
op->SetInput("expand_times_tensor", this->Input("expand_times_tensor"));
}
if (this->HasInput("ExpandTimes")) {
op->SetInput("ExpandTimes", this->Input("ExpandTimes"));
}
op->SetAttrMap(this->Attrs());
op->SetType("expand");
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandGradNoNeedBufVarsInferer, "X"); DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandGradNoNeedBufVarsInferer, "X");
} // namespace operators } // namespace operators
...@@ -238,6 +258,8 @@ REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker, ...@@ -238,6 +258,8 @@ REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker,
ops::ExpandGradOpMaker<paddle::framework::OpDesc>, ops::ExpandGradOpMaker<paddle::framework::OpDesc>,
ops::ExpandGradOpMaker<paddle::imperative::OpBase>); ops::ExpandGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp, REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp,
ops::ExpandDoubleGradOpMaker<paddle::framework::OpDesc>,
ops::ExpandDoubleGradOpMaker<paddle::imperative::OpBase>,
ops::ExpandGradNoNeedBufVarsInferer); ops::ExpandGradNoNeedBufVarsInferer);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>, expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>,
......
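A minimal standalone sketch of why the double grad of expand is expand again, for a size-2 input with expand_times = {2} (not Paddle code; the index arithmetic just models tiling):

#include <array>
#include <cassert>

int main() {
  std::array<float, 2> x{1.f, 2.f};
  std::array<float, 4> out{};                     // forward: [x0, x1, x0, x1]
  for (int i = 0; i < 4; ++i) out[i] = x[i % 2];

  std::array<float, 4> dout{1.f, 2.f, 3.f, 4.f};
  std::array<float, 2> dx{};                      // first grad: sum over the tiles
  for (int i = 0; i < 4; ++i) dx[i % 2] += dout[i];
  assert(dx[0] == 4.f && dx[1] == 6.f);

  std::array<float, 2> ddx{0.5f, 1.5f};           // perturbation of dx
  std::array<float, 4> ddout{};                   // double grad: expand(ddx) again,
  for (int i = 0; i < 4; ++i) ddout[i] = ddx[i % 2];  // which is the mapping the
  assert(ddout[2] == 0.5f && ddout[3] == 1.5f);       // DoubleGradOpMaker above wires up
  return 0;
}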
...@@ -230,6 +230,26 @@ class ExpandV2GradOpMaker : public framework::SingleGradOpMaker<T> { ...@@ -230,6 +230,26 @@ class ExpandV2GradOpMaker : public framework::SingleGradOpMaker<T> {
} }
}; };
template <typename T>
class ExpandV2DoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("expand_v2");
op->SetInput("X", this->OutputGrad(framework::GradVarName("X")));
op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out")));
if (this->HasInput("expand_shapes_tensor")) {
op->SetInput("expand_shapes_tensor", this->Input("expand_shapes_tensor"));
}
if (this->HasInput("Shape")) {
op->SetInput("Shape", this->Input("Shape"));
}
op->SetAttrMap(this->Attrs());
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandV2GradNoNeedBufVarsInferer, "X"); DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandV2GradNoNeedBufVarsInferer, "X");
} // namespace operators } // namespace operators
...@@ -240,6 +260,8 @@ REGISTER_OPERATOR(expand_v2, ops::ExpandV2Op, ops::ExpandV2OpMaker, ...@@ -240,6 +260,8 @@ REGISTER_OPERATOR(expand_v2, ops::ExpandV2Op, ops::ExpandV2OpMaker,
ops::ExpandV2GradOpMaker<paddle::framework::OpDesc>, ops::ExpandV2GradOpMaker<paddle::framework::OpDesc>,
ops::ExpandV2GradOpMaker<paddle::imperative::OpBase>); ops::ExpandV2GradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(expand_v2_grad, ops::ExpandV2GradOp, REGISTER_OPERATOR(expand_v2_grad, ops::ExpandV2GradOp,
ops::ExpandV2DoubleGradOpMaker<paddle::framework::OpDesc>,
ops::ExpandV2DoubleGradOpMaker<paddle::imperative::OpBase>,
ops::ExpandV2GradNoNeedBufVarsInferer); ops::ExpandV2GradNoNeedBufVarsInferer);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
expand_v2, ops::ExpandV2Kernel<paddle::platform::CPUDeviceContext, float>, expand_v2, ops::ExpandV2Kernel<paddle::platform::CPUDeviceContext, float>,
......
...@@ -27,27 +27,6 @@ namespace operators { ...@@ -27,27 +27,6 @@ namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
inline framework::DDim GetShape(const framework::ExecutionContext &ctx,
std::string op_type) {
// 1. shape is a Tensor
if (ctx.HasInput("ShapeTensor")) {
auto *shape_tensor = ctx.Input<framework::LoDTensor>("ShapeTensor");
auto vec_shape = GetDataFromTensor<int>(shape_tensor);
return framework::make_ddim(vec_shape);
}
// 2. shape is a list/tuple containing Tensor
auto shape_tensor_list = ctx.MultiInput<framework::Tensor>("ShapeTensorList");
if (shape_tensor_list.size() > 0) {
auto vec_shape = GetDataFromTensorList(shape_tensor_list);
return framework::make_ddim(vec_shape);
}
// 3. shape is a list/tuple without containing Tensor
auto vec_shape = ctx.Attr<std::vector<int64_t>>("shape");
return framework::make_ddim(vec_shape);
}
template <typename T> template <typename T>
class FillConstantKernel : public framework::OpKernel<T> { class FillConstantKernel : public framework::OpKernel<T> {
public: public:
...@@ -93,8 +72,7 @@ class FillConstantKernel : public framework::OpKernel<T> { ...@@ -93,8 +72,7 @@ class FillConstantKernel : public framework::OpKernel<T> {
} }
value = tensor_data[0]; value = tensor_data[0];
} }
const std::string op_type = "fill_constant"; auto shape = GetShape(ctx);
auto shape = GetShape(ctx, op_type);
if (out_var->IsType<framework::LoDTensor>()) { if (out_var->IsType<framework::LoDTensor>()) {
tensor = out_var->GetMutable<framework::LoDTensor>(); tensor = out_var->GetMutable<framework::LoDTensor>();
......
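The kernels in this commit now call a shared GetShape(ctx). A standalone sketch of the three-way priority encoded by the block removed above, with the hypothetical inputs modelled as plain containers (not the actual shared helper):

#include <cassert>
#include <optional>
#include <vector>

// Shape resolution order: a ShapeTensor input wins over a ShapeTensorList,
// which wins over the static "shape" attribute.
std::vector<int64_t> ResolveShape(
    const std::optional<std::vector<int64_t>>& shape_tensor,
    const std::vector<std::vector<int64_t>>& shape_tensor_list,
    const std::vector<int64_t>& shape_attr) {
  if (shape_tensor) return *shape_tensor;  // 1. single shape tensor
  if (!shape_tensor_list.empty()) {        // 2. list of scalar shape tensors
    std::vector<int64_t> shape;
    for (const auto& t : shape_tensor_list) shape.push_back(t.at(0));
    return shape;
  }
  return shape_attr;                       // 3. attribute fallback
}

int main() {
  assert((ResolveShape(std::nullopt, {}, {2, 3}) == std::vector<int64_t>{2, 3}));
  assert((ResolveShape(std::nullopt, {{4}, {5}}, {2, 3}) ==
          std::vector<int64_t>{4, 5}));
  assert((ResolveShape(std::vector<int64_t>{7, 8}, {{4}}, {2, 3}) ==
          std::vector<int64_t>{7, 8}));
  return 0;
}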
...@@ -367,8 +367,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> { ...@@ -367,8 +367,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = math::GetBlas<DeviceContext, T>(ctx);
for (int64_t i = 0; i < ids_numel; ++i) { for (int64_t i = 0; i < ids_numel; ++i) {
PADDLE_ENFORCE_LT(ids_data[i], row_number); PADDLE_ENFORCE_LT(
PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i); ids_data[i], row_number,
platform::errors::OutOfRange(
"Value of Ids %d should less than dict size %d.", i, row_number));
PADDLE_ENFORCE_GE(ids_data[i], 0,
platform::errors::OutOfRange(
"Value of Ids %d should greater than ZERO.", i));
memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width, memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width,
row_width * sizeof(T)); row_width * sizeof(T));
} }
...@@ -473,8 +478,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> { ...@@ -473,8 +478,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
for (int64_t i = 0; i < ids_numel; ++i) { for (int64_t i = 0; i < ids_numel; ++i) {
PADDLE_ENFORCE_LT(ids_data[i], row_number); PADDLE_ENFORCE_LT(
PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i); ids_data[i], row_number,
platform::errors::OutOfRange(
"Value of Ids %d should less than dict size %d.", i, row_number));
PADDLE_ENFORCE_GE(ids_data[i], 0,
platform::errors::OutOfRange(
"Value of Ids %d should greater than ZERO.", i));
memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width, memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width,
row_width * sizeof(T)); row_width * sizeof(T));
} }
......
...@@ -30,16 +30,18 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -30,16 +30,18 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fusion_gru"); OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fusion_gru");
OP_INOUT_CHECK(ctx->HasInput("WeightX"), "Input", "WeightX", "fusion_gru"); OP_INOUT_CHECK(ctx->HasInput("WeightX"), "Input", "WeightX", "fusion_gru");
OP_INOUT_CHECK(ctx->HasInput("WeightH"), "Input", "WeightH", "fusion_gru"); OP_INOUT_CHECK(ctx->HasInput("WeightH"), "Input", "WeightH", "fusion_gru");
OP_INOUT_CHECK(ctx->HasOutput("XX"), "Output", "XX", "fusion_gru"); OP_INOUT_CHECK(ctx->HasOutput("XX"), "Output", "XX", "fusion_gru");
OP_INOUT_CHECK(ctx->HasOutput("Hidden"), "Output", "Hidden", "fusion_gru"); OP_INOUT_CHECK(ctx->HasOutput("Hidden"), "Output", "Hidden", "fusion_gru");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(x_dims.size(), 2, auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1)
platform::errors::InvalidArgument( ? framework::flatten_to_2d(x_dims, 1)
"Input(X)'s rank must be 2, but received input dim " : x_dims;
"size is:%d, input dim is:[%s]", PADDLE_ENFORCE_EQ(
x_dims.size(), x_dims)); x_mat_dims.size(), 2,
platform::errors::InvalidArgument("The size of input X dims should be 2, "
"or 3 with second dimension equal to "
"1, but now Input X dim is:[%s] ",
x_dims));
auto wx_dims = ctx->GetInputDim("WeightX"); auto wx_dims = ctx->GetInputDim("WeightX");
PADDLE_ENFORCE_EQ(wx_dims.size(), 2, PADDLE_ENFORCE_EQ(wx_dims.size(), 2,
...@@ -47,12 +49,14 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -47,12 +49,14 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
"The rank of Input(WeightX) should be 2, but received " "The rank of Input(WeightX) should be 2, but received "
"WeightX dim size is:%d, WeightX dim is:[%s] ", "WeightX dim size is:%d, WeightX dim is:[%s] ",
wx_dims.size(), wx_dims)); wx_dims.size(), wx_dims));
PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1], PADDLE_ENFORCE_EQ(
platform::errors::InvalidArgument( wx_dims[0], x_mat_dims[1],
"The first dimension of Input(WeightX) " platform::errors::InvalidArgument(
"should equal to second dimension of input x, but " "The first dimension of flattened WeightX"
"received WeightX dimension is:%d, x dimension is:%d", "should equal to last dimension of flattened input X, but "
wx_dims[0], x_dims[1])); "received fattened WeightX dimension is:%d, flattened X dimension "
"is:%d",
wx_dims[0], x_mat_dims[1]));
int frame_size = wx_dims[1] / 3; int frame_size = wx_dims[1] / 3;
auto wh_dims = ctx->GetInputDim("WeightH"); auto wh_dims = ctx->GetInputDim("WeightH");
...@@ -102,24 +106,24 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -102,24 +106,24 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
"received bias dim is:[%s], frame size is:%d", "received bias dim is:[%s], frame size is:%d",
b_dims, frame_size)); b_dims, frame_size));
} }
framework::DDim out_dims({x_dims[0], frame_size}); framework::DDim out_dims({x_mat_dims[0], frame_size});
ctx->SetOutputDim("Hidden", out_dims); ctx->SetOutputDim("Hidden", out_dims);
ctx->ShareLoD("X", "Hidden"); ctx->ShareLoD("X", "Hidden");
int xx_width; int xx_width;
if (ctx->Attrs().Get<bool>("use_seq")) { if (ctx->Attrs().Get<bool>("use_seq")) {
xx_width = wx_dims[1]; xx_width = wx_dims[1];
} else { } else {
xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; xx_width = x_mat_dims[1] > wx_dims[1] ? wx_dims[1] : x_mat_dims[1];
OP_INOUT_CHECK(ctx->HasOutput("ReorderedH0"), "Output", "ReorderedH0", OP_INOUT_CHECK(ctx->HasOutput("ReorderedH0"), "Output", "ReorderedH0",
"fusion_gru"); "fusion_gru");
OP_INOUT_CHECK(ctx->HasOutput("BatchedInput"), "Output", "BatchedInput", OP_INOUT_CHECK(ctx->HasOutput("BatchedInput"), "Output", "BatchedInput",
"fusion_gru"); "fusion_gru");
OP_INOUT_CHECK(ctx->HasOutput("BatchedOut"), "Output", "BatchedOut", OP_INOUT_CHECK(ctx->HasOutput("BatchedOut"), "Output", "BatchedOut",
"fusion_gru"); "fusion_gru");
ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]}); ctx->SetOutputDim("BatchedInput", {x_mat_dims[0], wx_dims[1]});
ctx->SetOutputDim("BatchedOut", out_dims); ctx->SetOutputDim("BatchedOut", out_dims);
} }
ctx->SetOutputDim("XX", {x_dims[0], xx_width}); ctx->SetOutputDim("XX", {x_mat_dims[0], xx_width});
ctx->ShareLoD("X", "XX"); ctx->ShareLoD("X", "XX");
} }
...@@ -202,6 +206,27 @@ void FusionGRUOpMaker::Make() { ...@@ -202,6 +206,27 @@ void FusionGRUOpMaker::Make() {
AddAttr<bool>("use_mkldnn", AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel") "(bool, default false) Only used in mkldnn kernel")
.SetDefault(false); .SetDefault(false);
AddAttr<std::string>(
"mkldnn_data_type",
"(string, default \"float32\"). Data type of mkldnn kernel")
.SetDefault("float32")
.InEnum({"float32", "int8", "bfloat16"});
AddAttr<float>("Scale_data",
"Scale to be used for int8 input/output data."
"Only used with MKL-DNN INT8.")
.SetDefault(1.0f);
AddAttr<float>("Shift_data",
"Shift to be used for int8 input/output data."
"Only used with MKL-DNN INT8.")
.SetDefault(0.0f);
AddAttr<std::vector<float>>("Scale_weights",
"Scale_weights to be used for int8 weights data."
"Only used with MKL-DNN INT8.")
.SetDefault({1.0f});
AddAttr<bool>("force_fp32_output",
"(bool, default false) Force INT8 kernel output FP32, only "
"used in MKL-DNN INT8")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
The Fusion complete GRU Operator. The Fusion complete GRU Operator.
This operator fuse the fully-connected operator into GRU, This operator fuse the fully-connected operator into GRU,
...@@ -220,14 +245,17 @@ class FusionGRUKernel : public framework::OpKernel<T> { ...@@ -220,14 +245,17 @@ class FusionGRUKernel : public framework::OpKernel<T> {
} }
} }
#define INIT_BASE_DEFINES \ #define INIT_BASE_DEFINES \
auto* x = ctx.Input<LoDTensor>("X"); \ auto* x = ctx.Input<LoDTensor>("X"); \
auto* wh = ctx.Input<Tensor>("WeightH"); \ auto* wh = ctx.Input<Tensor>("WeightH"); \
auto* xx = ctx.Output<LoDTensor>("XX"); \ auto* xx = ctx.Output<LoDTensor>("XX"); \
auto x_lod = x->lod(); \ auto x_lod = x->lod(); \
auto x_dims = x->dims(); /* T x M*/ \ auto x_dims = x->dims(); /* T x M*/ \
auto wh_dims = wh->dims(); /* D x 3D*/ \ auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1) \
const int total_T = x_dims[0]; \ ? framework::flatten_to_2d(x_dims, 1) \
: x_dims; \
auto wh_dims = wh->dims(); /* D x 3D*/ \
const int total_T = x_mat_dims[0]; \
const int D3 = wh_dims[1] const int D3 = wh_dims[1]
#define INIT_OTHER_DEFINES \ #define INIT_OTHER_DEFINES \
...@@ -236,7 +264,7 @@ class FusionGRUKernel : public framework::OpKernel<T> { ...@@ -236,7 +264,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
auto* bias = ctx.Input<Tensor>("Bias"); \ auto* bias = ctx.Input<Tensor>("Bias"); \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \ auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
bool is_reverse = ctx.Attr<bool>("is_reverse"); \ bool is_reverse = ctx.Attr<bool>("is_reverse"); \
const int M = x_dims[1]; \ const int M = x_mat_dims[1]; \
const int D = wh_dims[0]; \ const int D = wh_dims[0]; \
const int D2 = D * 2; \ const int D2 = D * 2; \
const jit::gru_attr_t attr( \ const jit::gru_attr_t attr( \
......
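A minimal standalone sketch of the shape handling added above: a rank-3 input of shape [T, 1, M] is flattened to [T, M], matching framework::flatten_to_2d(x_dims, 1), while any other rank-3 shape falls through to the rank-2 check (not Paddle code):

#include <cassert>
#include <vector>

// Hypothetical helper mirroring the x_mat_dims computation above.
std::vector<int64_t> ToMatDims(const std::vector<int64_t>& dims) {
  if (dims.size() == 3 && dims[1] == 1) return {dims[0], dims[2]};
  return dims;  // validation of the rank-2 requirement happens afterwards
}

int main() {
  assert((ToMatDims({10, 1, 32}) == std::vector<int64_t>{10, 32}));
  assert((ToMatDims({10, 32}) == std::vector<int64_t>{10, 32}));
  assert((ToMatDims({10, 2, 32}).size() == 3));  // left as-is, then rejected
  return 0;
}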
...@@ -34,8 +34,7 @@ class CPUGaussianRandomKernel : public framework::OpKernel<T> { ...@@ -34,8 +34,7 @@ class CPUGaussianRandomKernel : public framework::OpKernel<T> {
auto* tensor = context.Output<framework::Tensor>("Out"); auto* tensor = context.Output<framework::Tensor>("Out");
std::normal_distribution<T> dist(mean, std); std::normal_distribution<T> dist(mean, std);
const std::string op_type = "gaussian_random"; auto shape = GetShape(context);
auto shape = GetShape(context, op_type);
tensor->Resize(shape); tensor->Resize(shape);
int64_t size = tensor->numel(); int64_t size = tensor->numel();
T* data = tensor->mutable_data<T>(context.GetPlace()); T* data = tensor->mutable_data<T>(context.GetPlace());
......
...@@ -58,8 +58,7 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> { ...@@ -58,8 +58,7 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
T mean = static_cast<T>(context.Attr<float>("mean")); T mean = static_cast<T>(context.Attr<float>("mean"));
T std = static_cast<T>(context.Attr<float>("std")); T std = static_cast<T>(context.Attr<float>("std"));
thrust::counting_iterator<unsigned int> index_sequence_begin(0); thrust::counting_iterator<unsigned int> index_sequence_begin(0);
const std::string op_type = "gaussian_random"; auto shape = GetShape(context);
auto shape = GetShape(context, op_type);
tensor->Resize(shape); tensor->Resize(shape);
T* data = tensor->mutable_data<T>(context.GetPlace()); T* data = tensor->mutable_data<T>(context.GetPlace());
......
...@@ -115,7 +115,7 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -115,7 +115,7 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<std::string>( AddAttr<std::string>(
"padding_mode", "padding_mode",
"(bool, default true) The padding method used when source" "(bool, default true) The padding method used when source"
"index is out of input images. It can be 'zeros', 'reflect' and " "index is out of input images. It can be 'zeros', 'reflection' and "
"'border'.") "'border'.")
.SetDefault("zeros"); .SetDefault("zeros");
...@@ -174,6 +174,10 @@ class GridSampleOpGrad : public framework::OperatorWithKernel { ...@@ -174,6 +174,10 @@ class GridSampleOpGrad : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output",
framework::GradVarName("X"), "grid_sampler");
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Grid")), "Output",
framework::GradVarName("Grid"), "grid_sampler");
auto input_dims = ctx->GetInputDim("X"); auto input_dims = ctx->GetInputDim("X");
auto grid_dims = ctx->GetInputDim("Grid"); auto grid_dims = ctx->GetInputDim("Grid");
if (ctx->HasOutput(framework::GradVarName("X"))) { if (ctx->HasOutput(framework::GradVarName("X"))) {
......
...@@ -268,7 +268,7 @@ class GridSampleOpCUDAKernel : public framework::OpKernel<T> { ...@@ -268,7 +268,7 @@ class GridSampleOpCUDAKernel : public framework::OpKernel<T> {
Mode mode; Mode mode;
if (padding_mode_s == "border") { if (padding_mode_s == "border") {
padding_mode = PaddingMode::border; padding_mode = PaddingMode::border;
} else if (padding_mode_s == "reflect") { } else if (padding_mode_s == "reflection") {
padding_mode = PaddingMode::reflect; padding_mode = PaddingMode::reflect;
} else { } else {
padding_mode = PaddingMode::zeros; padding_mode = PaddingMode::zeros;
...@@ -432,7 +432,7 @@ class GridSampleGradOpCUDAKernel : public framework::OpKernel<T> { ...@@ -432,7 +432,7 @@ class GridSampleGradOpCUDAKernel : public framework::OpKernel<T> {
Mode mode; Mode mode;
if (padding_mode_s == "border") { if (padding_mode_s == "border") {
padding_mode = PaddingMode::border; padding_mode = PaddingMode::border;
} else if (padding_mode_s == "reflect") { } else if (padding_mode_s == "reflection") {
padding_mode = PaddingMode::reflect; padding_mode = PaddingMode::reflect;
} else { } else {
padding_mode = PaddingMode::zeros; padding_mode = PaddingMode::zeros;
......
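The attribute string changes from "reflect" to "reflection" while the internal enum member keeps its old name. A minimal standalone sketch of the string-to-enum mapping the two CUDA kernels above perform, assuming a local PaddingMode enum:

#include <string>

enum class PaddingMode { zeros, border, reflect };

// "reflection" now selects the reflect mode; any unknown string falls back to
// zeros, matching the final else branch in the kernels above.
PaddingMode ToPaddingMode(const std::string& padding_mode_s) {
  if (padding_mode_s == "border") return PaddingMode::border;
  if (padding_mode_s == "reflection") return PaddingMode::reflect;
  return PaddingMode::zeros;
}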
...@@ -76,7 +76,7 @@ static inline void clip(const platform::CPUDeviceContext& ctx, ...@@ -76,7 +76,7 @@ static inline void clip(const platform::CPUDeviceContext& ctx,
if (padding_mode == "border") { if (padding_mode == "border") {
grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast<T>(0)) grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast<T>(0))
.cwiseMin(static_cast<T>(max_val)); .cwiseMin(static_cast<T>(max_val));
} else if (padding_mode == "reflect") { } else if (padding_mode == "reflection") {
if (align_corners) { if (align_corners) {
auto double_range = static_cast<T>(max_val * 2); auto double_range = static_cast<T>(max_val * 2);
auto grid_abs = grid_slice_t.abs(); auto grid_abs = grid_slice_t.abs();
...@@ -117,7 +117,7 @@ static inline void clipWithMask(const platform::CPUDeviceContext& ctx, ...@@ -117,7 +117,7 @@ static inline void clipWithMask(const platform::CPUDeviceContext& ctx,
auto in_bound = (res == grid_slice_t); auto in_bound = (res == grid_slice_t);
grid_scale_t.device(place) = grid_scale_t * in_bound.template cast<T>(); grid_scale_t.device(place) = grid_scale_t * in_bound.template cast<T>();
grid_slice_t.device(place) = res; grid_slice_t.device(place) = res;
} else if (padding_mode == "reflect") { } else if (padding_mode == "reflection") {
if (align_corners) { if (align_corners) {
auto double_range = static_cast<T>(max_val * 2); auto double_range = static_cast<T>(max_val * 2);
auto is_neg = (grid_slice_t < static_cast<T>(0)); auto is_neg = (grid_slice_t < static_cast<T>(0));
......
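Only the attribute string changes here; the reflection arithmetic in clip() and clipWithMask() is untouched. A standalone scalar sketch of the align_corners branch, assuming a hypothetical helper in place of the Eigen expressions above:

#include <algorithm>
#include <cmath>

// Fold |x| into one period of length 2*max_val, then mirror the second half of
// the period back, keeping the coordinate inside [0, max_val].
template <typename T>
T ReflectAlignCorners(T x, T max_val) {
  if (max_val <= static_cast<T>(0)) return static_cast<T>(0);
  const T double_range = max_val * 2;
  const T x_abs = std::abs(x);
  const T extra = x_abs - std::floor(x_abs / double_range) * double_range;
  return std::min(extra, double_range - extra);
}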
...@@ -67,7 +67,7 @@ static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) { ...@@ -67,7 +67,7 @@ static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) {
scale_tensor[0], 1, scale_tensor[0], 1,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Scale's shape must be 1, but got shape = %d .", scale_tensor[0])); "Scale's shape must be 1, but got shape = %d .", scale_tensor[0]));
// out_w = -1; out_w = -1;
} else { } else {
auto scale = ctx->Attrs().Get<std::vector<float>>("scale"); auto scale = ctx->Attrs().Get<std::vector<float>>("scale");
if (scale.size() > 0) { if (scale.size() > 0) {
...@@ -159,8 +159,8 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) { ...@@ -159,8 +159,8 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) {
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Scale's shape must be 2 or 1, but got shape = %d .", "Scale's shape must be 2 or 1, but got shape = %d .",
scale_tensor[0])); scale_tensor[0]));
// out_h = -1; out_h = -1;
// out_w = -1; out_w = -1;
} else { } else {
auto scale = ctx->Attrs().Get<std::vector<float>>("scale"); auto scale = ctx->Attrs().Get<std::vector<float>>("scale");
if (scale.size() > 0) { if (scale.size() > 0) {
...@@ -264,9 +264,9 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) { ...@@ -264,9 +264,9 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) {
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Scale's shape must be 3 or 1, but got shape = %d .", "Scale's shape must be 3 or 1, but got shape = %d .",
scale_tensor[0])); scale_tensor[0]));
// out_d = -1; out_d = -1;
// out_h = -1; out_h = -1;
// out_w = -1; out_w = -1;
} else { } else {
auto scale = ctx->Attrs().Get<std::vector<float>>("scale"); auto scale = ctx->Attrs().Get<std::vector<float>>("scale");
if (scale.size() > 0) { if (scale.size() > 0) {
...@@ -633,6 +633,9 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(InterpolateV2GradNoNeedBufferVarsInferer, ...@@ -633,6 +633,9 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(InterpolateV2GradNoNeedBufferVarsInferer,
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
// interp_v2 supports scale_factor whose input type is list; this operation is
// not compatible with interp_op, so a new one is added in paddle2.0
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(bilinear_interp_v2, ops::InterpolateV2Op, REGISTER_OPERATOR(bilinear_interp_v2, ops::InterpolateV2Op,
ops::InterpolateV2OpMaker, ops::InterpolateV2OpMaker,
......
...@@ -836,12 +836,12 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx, ...@@ -836,12 +836,12 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx,
int out_w = ctx.Attr<int>("out_w"); int out_w = ctx.Attr<int>("out_w");
auto list_new_shape_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor"); auto list_new_shape_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
float scale_w = -1;
if (list_new_shape_tensor.size() > 0) { if (list_new_shape_tensor.size() > 0) {
// have size tensor // have size tensor
auto new_size = get_new_shape(list_new_shape_tensor); auto new_size = get_new_shape(list_new_shape_tensor);
out_w = new_size[0]; out_w = new_size[0];
} else { } else {
float scale_w = -1;
auto scale_tensor = ctx.Input<Tensor>("Scale"); auto scale_tensor = ctx.Input<Tensor>("Scale");
auto scale = ctx.Attr<std::vector<float>>("scale"); auto scale = ctx.Attr<std::vector<float>>("scale");
if (scale_tensor != nullptr) { if (scale_tensor != nullptr) {
...@@ -887,8 +887,11 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx, ...@@ -887,8 +887,11 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx,
float ratio_w = 0.f; float ratio_w = 0.f;
if (out_w > 1) { if (out_w > 1) {
float new_scale_w = 0.f;
new_scale_w = (scale_w > 0) ? static_cast<float>(1. / scale_w)
: static_cast<float>(in_w) / out_w;
ratio_w = (align_corners) ? static_cast<float>(in_w - 1.0) / (out_w - 1.0) ratio_w = (align_corners) ? static_cast<float>(in_w - 1.0) / (out_w - 1.0)
: static_cast<float>(in_w) / out_w; : static_cast<float>(new_scale_w);
} }
int in_cw = c * in_w; int in_cw = c * in_w;
...@@ -924,14 +927,14 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, ...@@ -924,14 +927,14 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx,
int out_w = ctx.Attr<int>("out_w"); int out_w = ctx.Attr<int>("out_w");
auto list_new_shape_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor"); auto list_new_shape_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
float scale_w = -1;
float scale_h = -1;
if (list_new_shape_tensor.size() > 0) { if (list_new_shape_tensor.size() > 0) {
// have size tensor // have size tensor
auto new_size = get_new_shape(list_new_shape_tensor); auto new_size = get_new_shape(list_new_shape_tensor);
out_h = new_size[0]; out_h = new_size[0];
out_w = new_size[1]; out_w = new_size[1];
} else { } else {
float scale_h = -1;
float scale_w = -1;
auto scale_tensor = ctx.Input<Tensor>("Scale"); auto scale_tensor = ctx.Input<Tensor>("Scale");
auto scale = ctx.Attr<std::vector<float>>("scale"); auto scale = ctx.Attr<std::vector<float>>("scale");
if (scale_tensor != nullptr) { if (scale_tensor != nullptr) {
...@@ -993,12 +996,18 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, ...@@ -993,12 +996,18 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx,
float ratio_h = 0.f; float ratio_h = 0.f;
float ratio_w = 0.f; float ratio_w = 0.f;
if (out_h > 1) { if (out_h > 1) {
float new_scale_h = 0.f;
new_scale_h = (scale_h > 0) ? static_cast<float>(1. / scale_h)
: static_cast<float>(in_h) / out_h;
ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1) ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
: static_cast<float>(in_h) / out_h; : static_cast<float>(new_scale_h);
} }
if (out_w > 1) { if (out_w > 1) {
float new_scale_w = 0.f;
new_scale_w = (scale_w > 0) ? static_cast<float>(1. / scale_w)
: static_cast<float>(in_w) / out_w;
ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1) ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
: static_cast<float>(in_w) / out_w; : static_cast<float>(new_scale_w);
} }
int in_hw = in_h * in_w; int in_hw = in_h * in_w;
...@@ -1048,6 +1057,9 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, ...@@ -1048,6 +1057,9 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx,
int out_w = ctx.Attr<int>("out_w"); int out_w = ctx.Attr<int>("out_w");
auto list_new_shape_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor"); auto list_new_shape_tensor = ctx.MultiInput<framework::Tensor>("SizeTensor");
float scale_w = -1;
float scale_d = -1;
float scale_h = -1;
if (list_new_shape_tensor.size() > 0) { if (list_new_shape_tensor.size() > 0) {
// have size tensor // have size tensor
auto new_size = get_new_shape(list_new_shape_tensor); auto new_size = get_new_shape(list_new_shape_tensor);
...@@ -1055,9 +1067,6 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, ...@@ -1055,9 +1067,6 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx,
out_h = new_size[1]; out_h = new_size[1];
out_w = new_size[2]; out_w = new_size[2];
} else { } else {
float scale_d = -1;
float scale_h = -1;
float scale_w = -1;
auto scale_tensor = ctx.Input<Tensor>("Scale"); auto scale_tensor = ctx.Input<Tensor>("Scale");
auto scale = ctx.Attr<std::vector<float>>("scale"); auto scale = ctx.Attr<std::vector<float>>("scale");
if (scale_tensor != nullptr) { if (scale_tensor != nullptr) {
...@@ -1129,16 +1138,25 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, ...@@ -1129,16 +1138,25 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx,
float ratio_h = 0.f; float ratio_h = 0.f;
float ratio_w = 0.f; float ratio_w = 0.f;
if (out_d > 1) { if (out_d > 1) {
float new_scale_d = 0.f;
new_scale_d = (scale_d > 0) ? static_cast<float>(1. / scale_d)
: static_cast<float>(in_d) / out_d;
ratio_d = (align_corners) ? static_cast<float>(in_d - 1) / (out_d - 1) ratio_d = (align_corners) ? static_cast<float>(in_d - 1) / (out_d - 1)
: static_cast<float>(in_d) / out_d; : static_cast<float>(new_scale_d);
} }
if (out_h > 1) { if (out_h > 1) {
float new_scale_h = 0.f;
new_scale_h = (scale_h > 0) ? static_cast<float>(1. / scale_h)
: static_cast<float>(in_h) / out_h;
ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1) ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
: static_cast<float>(in_h) / out_h; : static_cast<float>(new_scale_h);
} }
if (out_w > 1) { if (out_w > 1) {
float new_scale_w = 0.f;
new_scale_w = (scale_w > 0) ? static_cast<float>(1. / scale_w)
: static_cast<float>(in_w) / out_w;
ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1) ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
: static_cast<float>(in_w) / out_w; : static_cast<float>(new_scale_w);
} }
int in_dhw = in_d * in_h * in_w; int in_dhw = in_d * in_h * in_w;
...@@ -1230,8 +1248,11 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, ...@@ -1230,8 +1248,11 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx,
float ratio_w = 0.f; float ratio_w = 0.f;
if (out_w > 1) { if (out_w > 1) {
float new_scale_w = 0.f;
new_scale_w = (scale_w > 0) ? static_cast<float>(1. / scale_w)
: static_cast<float>(in_w) / out_w;
ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1) ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
: static_cast<float>(in_w) / out_w; : static_cast<float>(new_scale_w);
} }
int in_cw = c * in_w; int in_cw = c * in_w;
int out_cw = c * out_w; int out_cw = c * out_w;
...@@ -1333,12 +1354,18 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, ...@@ -1333,12 +1354,18 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx,
float ratio_h = 0.f; float ratio_h = 0.f;
float ratio_w = 0.f; float ratio_w = 0.f;
if (out_h > 1) { if (out_h > 1) {
float new_scale_h = 0.f;
new_scale_h = (scale_h > 0) ? static_cast<float>(1. / scale_h)
: static_cast<float>(in_h) / out_h;
ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1) ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
: static_cast<float>(in_h) / out_h; : static_cast<float>(new_scale_h);
} }
if (out_w > 1) { if (out_w > 1) {
float new_scale_w = 0.f;
new_scale_w = (scale_w > 0) ? static_cast<float>(1. / scale_w)
: static_cast<float>(in_w) / out_w;
ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1) ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
: static_cast<float>(in_w) / out_w; : static_cast<float>(new_scale_w);
} }
int in_hw = in_h * in_w; int in_hw = in_h * in_w;
...@@ -1464,16 +1491,25 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, ...@@ -1464,16 +1491,25 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx,
float ratio_h = 0.f; float ratio_h = 0.f;
float ratio_w = 0.f; float ratio_w = 0.f;
if (out_d > 1) { if (out_d > 1) {
float new_scale_d = 0.f;
new_scale_d = (scale_d > 0) ? static_cast<float>(1. / scale_d)
: static_cast<float>(in_d) / out_d;
ratio_d = (align_corners) ? static_cast<float>(in_d - 1) / (out_d - 1) ratio_d = (align_corners) ? static_cast<float>(in_d - 1) / (out_d - 1)
: static_cast<float>(in_d) / out_d; : static_cast<float>(new_scale_d);
} }
if (out_h > 1) { if (out_h > 1) {
float new_scale_h = 0.f;
new_scale_h = (scale_h > 0) ? static_cast<float>(1. / scale_h)
: static_cast<float>(in_h) / out_h;
ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1) ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
: static_cast<float>(in_h) / out_h; : static_cast<float>(new_scale_h);
} }
if (out_w > 1) { if (out_w > 1) {
float new_scale_w = 0.f;
new_scale_w = (scale_w > 0) ? static_cast<float>(1. / scale_w)
: static_cast<float>(in_w) / out_w;
ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1) ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
: static_cast<float>(in_w) / out_w; : static_cast<float>(new_scale_w);
} }
int in_dhw = in_d * in_h * in_w; int in_dhw = in_d * in_h * in_w;
......
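Every forward and backward hunk above applies the same rule: with align_corners the ratio stays (in - 1) / (out - 1); otherwise the kernel now prefers 1 / scale when a positive scale was supplied and only falls back to in / out when it was not. A standalone sketch of that rule for one axis, assuming scalar arguments:

// Ratio used by the interpolate_v2 CUDA kernels for a single spatial axis.
float ComputeRatio(int in, int out, float scale, bool align_corners) {
  if (out <= 1) return 0.f;  // ratio_* keeps its 0.f default for size-1 outputs
  if (align_corners) {
    return static_cast<float>(in - 1) / (out - 1);
  }
  return (scale > 0) ? static_cast<float>(1. / scale)
                     : static_cast<float>(in) / out;
}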
...@@ -72,7 +72,11 @@ class KLDivLossKernel : public framework::OpKernel<T> { ...@@ -72,7 +72,11 @@ class KLDivLossKernel : public framework::OpKernel<T> {
loss_t.device(place) = output; loss_t.device(place) = output;
} else if ("batchmean" == reduction) { } else if ("batchmean" == reduction) {
auto output_sum = output.sum(); auto output_sum = output.sum();
loss_t.device(place) = output_sum / output_sum.constant(n); if (n > 0) {
loss_t.device(place) = output_sum / output_sum.constant(n);
} else {
loss_t.device(place) = output_sum;
}
} else if ("mean" == reduction) { } else if ("mean" == reduction) {
loss_t.device(place) = output.mean(); loss_t.device(place) = output.mean();
} else if ("sum" == reduction) { } else if ("sum" == reduction) {
......
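The new guard keeps "batchmean" from dividing by zero when the leading dimension n is 0. A minimal scalar sketch of the guarded reduction, assuming a hypothetical helper over a raw array rather than the Eigen path above:

#include <cstdint>

// Sum the per-element KL terms and divide by the batch size n, returning the
// plain sum when the batch is empty, exactly as the n > 0 guard does.
template <typename T>
T BatchMeanReduce(const T* loss, int64_t numel, int64_t n) {
  T sum = static_cast<T>(0);
  for (int64_t i = 0; i < numel; ++i) sum += loss[i];
  return (n > 0) ? sum / static_cast<T>(n) : sum;
}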
...@@ -39,7 +39,7 @@ class LiteEngineOp : public framework::OperatorBase { ...@@ -39,7 +39,7 @@ class LiteEngineOp : public framework::OperatorBase {
private: private:
std::vector<std::string> in_names_; std::vector<std::string> in_names_;
std::vector<std::string> out_names_; std::vector<std::string> out_names_;
paddle::lite::Predictor *engine_; paddle::lite_api::PaddlePredictor *engine_;
framework::proto::VarType::Type precision_; framework::proto::VarType::Type precision_;
bool use_gpu_; bool use_gpu_;
bool zero_copy_; bool zero_copy_;
...@@ -78,10 +78,10 @@ class LiteEngineOp : public framework::OperatorBase { ...@@ -78,10 +78,10 @@ class LiteEngineOp : public framework::OperatorBase {
framework::LoDTensor src_t = framework::LoDTensor src_t =
inference::analysis::GetFromScope<framework::LoDTensor>(scope, inference::analysis::GetFromScope<framework::LoDTensor>(scope,
in_names_[i]); in_names_[i]);
paddle::lite::Tensor *dst_t = engine_->GetInput(i); paddle::lite_api::Tensor dst_t = *(engine_->GetInput(i));
VLOG(3) << "== fluid -> lite (" << in_names_[i] << " -> " VLOG(3) << "== fluid -> lite (" << in_names_[i] << " -> "
<< engine_->GetInputNames()[i] << ")"; << engine_->GetInputNames()[i] << ")";
inference::lite::utils::TensorCopy(dst_t, &src_t, *ctx, zero_copy_); inference::lite::utils::TensorCopy(&dst_t, &src_t, *ctx, zero_copy_);
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(dev_place)) { if (platform::is_gpu_place(dev_place)) {
...@@ -93,7 +93,7 @@ class LiteEngineOp : public framework::OperatorBase { ...@@ -93,7 +93,7 @@ class LiteEngineOp : public framework::OperatorBase {
engine_->Run(); engine_->Run();
VLOG(3) << "lite engine run done"; VLOG(3) << "lite engine run done";
for (size_t i = 0; i < out_names_.size(); i++) { for (size_t i = 0; i < out_names_.size(); i++) {
paddle::lite::Tensor src_t = *(engine_->GetOutput(i)); paddle::lite_api::Tensor src_t = *(engine_->GetOutput(i));
framework::LoDTensor *dst_t = framework::LoDTensor *dst_t =
&inference::analysis::GetFromScope<framework::LoDTensor>( &inference::analysis::GetFromScope<framework::LoDTensor>(
scope, out_names_[i]); scope, out_names_[i]);
......
(The diffs for the remaining files in this commit are collapsed and not shown.)