提交 83c2d35a 编写于 作者: S seiriosPlus

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into optimize/large_scale_kv_spped

......@@ -16,7 +16,7 @@ else()
set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs10 "30 35 50 52 60 61 70 75")
set(paddle_known_gpu_archs11 "35 50 52 60 61 70 75 80")
set(paddle_known_gpu_archs11 "52 60 61 70 75 80")
endif()
######################################################################################
......
......@@ -19,7 +19,7 @@ SET(DGC_SOURCES_DIR "${THIRD_PARTY_PATH}/dgc/src/extern_dgc")
SET(DGC_INSTALL_DIR "${THIRD_PARTY_PATH}/install/dgc")
SET(DGC_INCLUDE_DIR "${DGC_INSTALL_DIR}/include" CACHE PATH "dgc include directory." FORCE)
SET(DGC_LIBRARIES "${DGC_INSTALL_DIR}/lib/libdgc.a" CACHE FILEPATH "dgc library." FORCE)
SET(DGC_URL "http://fleet.bj.bcebos.com/collective_ef2216a.tgz")
SET(DGC_URL "https://fleet.bj.bcebos.com/dgc/collective_f66ef73.tgz")
INCLUDE_DIRECTORIES(${DGC_INCLUDE_DIR})
cache_third_party(extern_dgc
......@@ -30,7 +30,7 @@ ExternalProject_Add(
extern_dgc
${EXTERNAL_PROJECT_LOG_ARGS}
"${DGC_DOWNLOAD_CMD}"
URL_MD5 "2f67549fd5f1262383d83289abc4f88f"
URL_MD5 "94e6fa1bc97169d0e1aad44570fe3251"
PREFIX "${DGC_PREFIX_DIR}"
SOURCE_DIR "${DGC_SOURCES_DIR}"
CONFIGURE_COMMAND ""
......
......@@ -34,7 +34,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite)
if(NOT LITE_GIT_TAG)
set(LITE_GIT_TAG dfdfa6440c83bf0b415f9f5a9ff84842ce0bb0fa)
set(LITE_GIT_TAG 6d2b2a4028a58715b01887b04eb9bff8432eb184)
endif()
if(NOT CUDA_ARCH_NAME)
......
......@@ -19,8 +19,8 @@ SET(MKLDNN_PREFIX_DIR ${THIRD_PARTY_PATH}/mkldnn)
SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn)
SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn)
SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
SET(MKLDNN_REPOSITORY https://github.com/intel/mkl-dnn.git)
SET(MKLDNN_TAG 1ea812f4f5aa1bd989372a23ab50d0f0f81ee677)
SET(MKLDNN_REPOSITORY https://github.com/oneapi-src/oneDNN.git)
SET(MKLDNN_TAG 64a48f9565aa72f6359917b3406328075a409939)
# Introduce variables:
# * CMAKE_INSTALL_LIBDIR
......
......@@ -18,7 +18,7 @@ SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc)
SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
set(WARPCTC_REPOSITORY https://github.com/baidu-research/warp-ctc.git)
set(WARPCTC_TAG bc29dcfff07ced1c7a19a4ecee48e5ad583cef8e)
set(WARPCTC_TAG fc7f226b93758216a03b1be9d24593a12819b984)
SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
CACHE PATH "Warp-ctc Directory" FORCE)
......
......@@ -28,7 +28,15 @@ function(CheckCompilerCXX11Flag)
endfunction()
CheckCompilerCXX11Flag()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
if (WITH_GPU)
if (${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.0)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
endif()
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
endif()
# safe_set_flag
#
# Set a compile flag only if compiler is support
......
......@@ -243,9 +243,10 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
ENDIF()
if(WITH_GPU)
if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
include(external/cub) # download cub
list(APPEND third_party_deps extern_cub)
endif()
set(CUDAERROR_URL "http://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "" FORCE)
file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") # download file cudaErrorMessage
endif(WITH_GPU)
......
......@@ -49,7 +49,8 @@ std::vector<std::string> PD_GetGradOpDescStrs(
for (size_t i = 0; i < op_num; ++i) {
PADDLE_ENFORCE_EQ(
grad_op_descs[i]->Proto()->SerializePartialToString(&ret[i]), true,
"Cannot serialize message.");
paddle::platform::errors::Unavailable(
"Cannot serialize operator desc message."));
}
}
return ret;
......
......@@ -36,7 +36,10 @@ message AMPConfig {
repeated string custom_black_varnames = 9;
}
message LocalSGDConfig { optional int32 k_steps = 1 [ default = 4 ]; }
message LocalSGDConfig {
optional int32 k_steps = 1 [ default = 1 ];
optional int32 begin_step = 2 [ default = 1 ];
}
message GradientMergeConfig {
optional int32 k_steps = 1 [ default = 1 ];
......@@ -52,6 +55,8 @@ message DGCConfig {
message LarsConfig {
optional float lars_coeff = 1 [ default = 0.001 ];
optional float lars_weight_decay = 2 [ default = 0.0005 ];
optional float epsilon = 3 [ default = 0.0 ];
repeated string exclude_from_weight_decay = 4;
}
message LambConfig {
......
......@@ -25,7 +25,7 @@ bool NCCLWrapper::is_initialized_ = false;
void NCCLWrapper::InitNCCL() {
#if defined(PADDLE_WITH_NCCL)
PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitRank(
&(nccl_info_.comm_), nccl_info_.global_ranks_, nccl_info_.nccl_id_,
nccl_info_.my_global_rank_));
#endif
......@@ -41,7 +41,8 @@ void NCCLWrapper::SetNCCLId(const NCCLInfo& nccl_info) {
NCCLInfo NCCLWrapper::GetNCCLId() {
#if defined(PADDLE_WITH_NCCL)
PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_)));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_)));
#endif
return nccl_info_;
}
......@@ -52,8 +53,8 @@ void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank,
nccl_info_.local_rank_ = local_rank;
nccl_info_.my_global_rank_ = global_rank;
nccl_info_.global_ranks_ = ranks;
PADDLE_ENFORCE(cudaSetDevice(local_rank));
PADDLE_ENFORCE(cudaStreamCreate(&(nccl_info_.stream_)));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(local_rank));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&(nccl_info_.stream_)));
#endif
return;
}
......@@ -65,7 +66,7 @@ void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope,
auto var = scope.FindVar(name);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
int32_t total_size = tensor->numel();
PADDLE_ENFORCE(platform::dynload::ncclBcast(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
reinterpret_cast<void*>(tensor->data<float>()), total_size, ncclFloat,
root_rank, nccl_info_.comm_, nccl_info_.stream_));
cudaStreamSynchronize(nccl_info_.stream_);
......
......@@ -42,7 +42,8 @@ void ThreadPool::Init() {
num_threads = FLAGS_dist_threadpool_size;
VLOG(1) << "set dist_threadpool_size to " << num_threads;
}
PADDLE_ENFORCE_GT(num_threads, 0);
PADDLE_ENFORCE_GT(num_threads, 0, platform::errors::InvalidArgument(
"The number of threads is 0."));
threadpool_.reset(new ThreadPool(num_threads));
}
}
......@@ -83,7 +84,8 @@ void ThreadPool::TaskLoop() {
}
if (tasks_.empty()) {
PADDLE_THROW("This thread has no task to Run");
PADDLE_THROW(platform::errors::Unavailable(
"Current thread has no task to Run."));
}
// pop a task from the task queue
......
......@@ -91,7 +91,8 @@ class ThreadPool {
{
std::unique_lock<std::mutex> lock(mutex_);
if (!running_) {
PADDLE_THROW("enqueue on stopped ThreadPool");
PADDLE_THROW(platform::errors::Unavailable(
"Task is enqueued into stopped ThreadPool."));
}
tasks_.push(std::move(task));
}
......
......@@ -43,8 +43,9 @@ void VarDesc::SetTensorDescNum(size_t num) {
} break;
default:
PADDLE_THROW(
"Setting 'sub_tensor_number' is not supported by the type of var %s.",
this->Name());
platform::errors::Unavailable("Setting 'sub_tensor_number' is not "
"supported by the %s type variable.",
this->Name()));
}
}
......@@ -55,8 +56,9 @@ size_t VarDesc::GetTensorDescNum() const {
break;
default:
PADDLE_THROW(
"Getting 'sub_tensor_number' is not supported by the type of var %s.",
this->Name());
platform::errors::Unavailable("Getting 'sub_tensor_number' is not "
"supported by the %s type variable.",
this->Name()));
}
}
......@@ -133,9 +135,9 @@ void VarDesc::SetLoDLevel(int32_t lod_level) {
desc_.mutable_type()->mutable_tensor_array()->set_lod_level(lod_level);
break;
default:
PADDLE_THROW(
"Setting 'lod_level' is not supported by the type of var %s.",
this->Name());
PADDLE_THROW(platform::errors::Unavailable(
"Setting 'lod_level' is not supported by the %s type variable.",
this->Name()));
}
}
......@@ -157,9 +159,9 @@ void VarDesc::SetLoDLevels(const std::vector<int32_t> &multiple_lod_level) {
}
} break;
default:
PADDLE_THROW(
"Setting 'lod_levels' is not supported by the type of var %s.",
this->Name());
PADDLE_THROW(platform::errors::Unavailable(
"Setting 'lod_levels' is not supported by the %s type variable",
this->Name()));
}
}
......@@ -170,9 +172,9 @@ int32_t VarDesc::GetLoDLevel() const {
case proto::VarType::LOD_TENSOR_ARRAY:
return desc_.type().tensor_array().lod_level();
default:
PADDLE_THROW(
"Getting 'lod_level' is not supported by the type of var %s.",
this->Name());
PADDLE_THROW(platform::errors::Unavailable(
"Getting 'lod_level' is not supported by the %s type variable.",
this->Name()));
}
}
......@@ -187,15 +189,19 @@ std::vector<int32_t> VarDesc::GetLoDLevels() const {
return res;
break;
default:
PADDLE_THROW(
"Getting 'lod_levels' is not supported by the type of var %s.",
this->Name());
PADDLE_THROW(platform::errors::Unavailable(
"Getting 'lod_levels' is not supported by the %s type variable.",
this->Name()));
}
}
const proto::VarType::TensorDesc &VarDesc::tensor_desc() const {
PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set.");
PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
PADDLE_ENFORCE_EQ(
desc_.has_type(), true,
platform::errors::NotFound("The variable's type was not be set."));
PADDLE_ENFORCE_EQ(
desc_.type().has_type(), true,
platform::errors::NotFound("The variable's type was not be set."));
switch (desc_.type().type()) {
case proto::VarType::SELECTED_ROWS:
return desc_.type().selected_rows();
......@@ -204,14 +210,16 @@ const proto::VarType::TensorDesc &VarDesc::tensor_desc() const {
case proto::VarType::LOD_TENSOR_ARRAY:
return desc_.type().tensor_array().tensor();
default:
PADDLE_THROW(
"Getting 'tensor_desc' is not supported by the type of var %s.",
this->Name());
PADDLE_THROW(platform::errors::Unavailable(
"Getting 'tensor_desc' is not supported by the %s type variable.",
this->Name()));
}
}
std::vector<proto::VarType::TensorDesc> VarDesc::tensor_descs() const {
PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
PADDLE_ENFORCE_EQ(
desc_.has_type(), true,
platform::errors::NotFound("The variable's type was not be set."));
std::vector<proto::VarType::TensorDesc> res;
res.reserve(GetTensorDescNum());
switch (desc_.type().type()) {
......@@ -221,16 +229,19 @@ std::vector<proto::VarType::TensorDesc> VarDesc::tensor_descs() const {
}
return res;
default:
PADDLE_THROW(
"Getting 'tensor_descs' is not supported by the type of var "
"%s.",
this->Name());
PADDLE_THROW(platform::errors::Unavailable(
"Getting 'tensor_descs' is not supported by the %s type variable.",
this->Name()));
}
}
proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() {
PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
PADDLE_ENFORCE_EQ(
desc_.has_type(), true,
platform::errors::NotFound("The variable's type was not be set."));
PADDLE_ENFORCE_EQ(
desc_.type().has_type(), true,
platform::errors::NotFound("The variable's type was not be set."));
switch (desc_.type().type()) {
case proto::VarType::SELECTED_ROWS:
return desc_.mutable_type()->mutable_selected_rows();
......@@ -240,15 +251,19 @@ proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() {
return desc_.mutable_type()->mutable_tensor_array()->mutable_tensor();
default:
PADDLE_THROW(
"Getting 'mutable_tensor_desc' is not supported by the type of var "
"%s.",
this->Name());
platform::errors::Unavailable("Getting 'mutable_tensor_desc' is not "
"supported by the %s type variable.",
this->Name()));
}
}
std::vector<proto::VarType::TensorDesc *> VarDesc::mutable_tensor_descs() {
PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
PADDLE_ENFORCE_EQ(
desc_.has_type(), true,
platform::errors::NotFound("The variable's type was not be set."));
PADDLE_ENFORCE_EQ(
desc_.type().has_type(), true,
platform::errors::NotFound("The variable's type was not be set."));
std::vector<proto::VarType::TensorDesc *> res;
res.reserve(GetTensorDescNum());
switch (desc_.type().type()) {
......@@ -259,10 +274,9 @@ std::vector<proto::VarType::TensorDesc *> VarDesc::mutable_tensor_descs() {
}
return res;
default:
PADDLE_THROW(
"Getting 'tensor_descs' is not supported by the type of var "
"%s.",
this->Name());
PADDLE_THROW(platform::errors::Unavailable(
"Getting 'tensor_descs' is not supported by the %s type variable.",
this->Name()));
}
}
......
......@@ -40,7 +40,8 @@ inline proto::VarType::Type ToVarType(int type) {
case proto::VarType::READER:
return static_cast<proto::VarType::Type>(type);
default:
PADDLE_THROW("ToVarType:Unsupported type %d", type);
PADDLE_THROW(platform::errors::Unavailable(
"ToVarType method Unsupported type %d.", type));
}
}
......@@ -66,7 +67,8 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
visitor(var.Get<FetchList>());
return;
default:
PADDLE_THROW("Not supported visit type, %s", ToTypeName(var.Type()));
PADDLE_THROW(platform::errors::Unavailable("Not supported visit type %s.",
ToTypeName(var.Type())));
}
}
......
......@@ -46,12 +46,14 @@ struct VarIdToTypeIndexMapInitializerImpl {
static_assert(!std::is_same<Type, void>::value, "Type cannot be void");
constexpr int kId = VarTypeTrait<Type>::kId;
auto type = std::type_index(typeid(Type));
PADDLE_ENFORCE(id_to_type->count(kId) == 0,
"Registered duplicate type id %d for type %s", kId,
type.name());
PADDLE_ENFORCE(type_to_id->count(type) == 0,
"Registered duplicate type_index %s for id %d", type.name(),
kId);
PADDLE_ENFORCE_EQ(
id_to_type->count(kId), 0,
platform::errors::AlreadyExists(
"Registered duplicate type id %d for type %s.", kId, type.name()));
PADDLE_ENFORCE_EQ(
type_to_id->count(type), 0,
platform::errors::AlreadyExists(
"Registered duplicate type index %s for id %d.", type.name(), kId));
id_to_type->emplace(kId, type);
type_to_id->emplace(type, kId);
VarIdToTypeIndexMapInitializerImpl<kStart + 1, kEnd,
......@@ -79,15 +81,17 @@ struct VarIdToTypeIndexMapHolder {
public:
static const std::type_index &ToTypeIndex(int var_id) {
auto it = Instance().id_to_type_map_.find(var_id);
PADDLE_ENFORCE(it != Instance().id_to_type_map_.end(),
"VarId %d is not registered.", var_id);
PADDLE_ENFORCE_NE(it, Instance().id_to_type_map_.end(),
platform::errors::NotFound(
"Variable Id %d is not registered.", var_id));
return it->second;
}
static int ToTypeId(const std::type_index &type) {
auto it = Instance().type_to_id_map_.find(type);
PADDLE_ENFORCE(it != Instance().type_to_id_map_.end(),
"VarType %s is not registered.", type.name());
PADDLE_ENFORCE_NE(it, Instance().type_to_id_map_.end(),
platform::errors::NotFound(
"Variable Type %s is not registered.", type.name()));
return it->second;
}
......
......@@ -50,11 +50,11 @@ void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
} else if (var_type == proto::VarType::RAW) {
// GetMutable will be called in operator
} else {
PADDLE_THROW(
PADDLE_THROW(platform::errors::Unavailable(
"Variable type %d is not in "
"[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
"LOD_RANK_TABLE, PLACE_LIST, READER, RAW]",
var_type);
"LOD_RANK_TABLE, PLACE_LIST, READER, RAW].",
var_type));
}
}
......@@ -76,7 +76,8 @@ void CopyVariable(const Variable &src_var, Variable *dst_var) {
auto *dst_t = tmp_grad_slr->mutable_value();
framework::TensorCopy(src_t, cpu_place, dst_t);
} else {
PADDLE_THROW("unknown var type to copy");
PADDLE_THROW(
platform::errors::Unavailable("Unknown variable type to copy."));
}
}
......
......@@ -218,6 +218,10 @@ struct Argument {
DECL_ARGUMENT_FIELD(fusion_statis, FusionStatis, fusion_statis_t);
// Only used in paddle-lite subgraph.
DECL_ARGUMENT_FIELD(cpu_math_library_num_threads, CpuMathLibraryNumThreads,
int);
private:
std::unordered_set<std::string> valid_fields_;
};
......
......@@ -150,6 +150,8 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("use_xpu", new bool(argument->use_xpu()));
pass->Set("xpu_l3_workspace_size",
new int(argument->xpu_l3_workspace_size()));
pass->Set("cpu_math_library_num_threads",
new int(argument->cpu_math_library_num_threads()));
}
disable_logs_ = argument->disable_logs();
if (pass_name == "fc_fuse_pass") {
......
......@@ -244,6 +244,7 @@ void LiteSubgraphPass::SetUpEngine(
bool enable_int8 = Get<bool>("enable_int8");
bool use_xpu = Get<bool>("use_xpu");
int xpu_l3_workspace_size = Get<int>("xpu_l3_workspace_size");
int cpu_math_library_num_threads = Get<int>("cpu_math_library_num_threads");
lite_api::TargetType target_type;
if (use_gpu) {
......@@ -263,11 +264,12 @@ void LiteSubgraphPass::SetUpEngine(
// Notice: The ordering here determines the device where the
// input tensor of the Lite engine is located, and then affects
// whether tensor sharing is feasible.
paddle::lite::Place({target_type, precision_type}),
paddle::lite::Place({target_type, PRECISION(kInt64)}),
paddle::lite::Place({target_type, PRECISION(kFloat)}),
paddle::lite::Place({TARGET(kHost), PRECISION(kFloat)}),
paddle::lite_api::Place({target_type, precision_type}),
paddle::lite_api::Place({target_type, PRECISION(kInt64)}),
paddle::lite_api::Place({target_type, PRECISION(kFloat)}),
paddle::lite_api::Place({TARGET(kHost), PRECISION(kFloat)}),
};
config.cpu_math_library_num_threads = cpu_math_library_num_threads;
config.xpu_l3_workspace_size = xpu_l3_workspace_size;
if (dump_model) {
lite::StrToBinaryFile("./model.bin", config.model);
......
......@@ -53,12 +53,10 @@ if(WITH_TESTING)
inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS paddle_fluid_shared
ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST")
elseif(WIN32)
inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps}
ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST")
endif()
endif()
......
......@@ -461,6 +461,8 @@ void AnalysisPredictor::PrepareArgument() {
}
if (config_.lite_engine_enabled()) {
argument_.SetCpuMathLibraryNumThreads(
config_.cpu_math_library_num_threads());
argument_.SetLitePrecisionMode(config_.lite_precision_mode_);
argument_.SetLitePassesFilter(config_.lite_passes_filter_);
argument_.SetLiteOpsFilter(config_.lite_ops_filter_);
......
......@@ -21,15 +21,21 @@
namespace paddle {
void ZeroCopyTensor::Reshape(const std::vector<int> &shape) {
PADDLE_ENFORCE(!name_.empty(),
PADDLE_ENFORCE_EQ(
name_.empty(), false,
platform::errors::PreconditionNotMet(
"Need to SetName first, so that the corresponding tensor can "
"be retrieved.");
PADDLE_ENFORCE(input_or_output_,
"Can't reshape the output tensor, it is readonly");
PADDLE_ENFORCE(scope_);
"be retrieved."));
PADDLE_ENFORCE_EQ(input_or_output_, true,
platform::errors::PermissionDenied(
"Can't reshape the output tensor, it is readonly"));
PADDLE_ENFORCE_NOT_NULL(scope_, platform::errors::PreconditionNotMet(
"The scope should not be nullptr."));
auto *scope = static_cast<framework::Scope *>(scope_);
auto *var = scope->FindVar(name_);
PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::PreconditionNotMet(
"No tensor called [%s] in the runtime scope", name_));
auto *tensor = var->GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim(shape));
}
......@@ -45,8 +51,10 @@ T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
EAGER_GET_TENSOR;
PADDLE_ENFORCE_GT(
tensor->numel(), 0,
"You should call ZeroCopyTensor::Reshape(const std::vector<int> &shape)"
"function before retrieving mutable_data from input tensor.");
platform::errors::PreconditionNotMet(
"You should call ZeroCopyTensor::Reshape(const std::vector<int> "
"&shape)"
"function before retrieving mutable_data from input tensor."));
switch (static_cast<int>(place)) {
case static_cast<int>(PaddlePlace::kCPU): {
return tensor->mutable_data<T>(platform::CPUPlace());
......@@ -55,7 +63,8 @@ T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
return tensor->mutable_data<T>(platform::CUDAPlace(device_));
}
default:
PADDLE_THROW("Unsupported place: %d", static_cast<int>(place));
PADDLE_THROW(platform::errors::Unavailable("Unsupported place: %d",
static_cast<int>(place)));
break;
}
return nullptr;
......@@ -96,10 +105,11 @@ PaddleDType ZeroCopyTensor::type() const {
template <typename T>
void ZeroCopyTensor::copy_from_cpu(const T *data) {
EAGER_GET_TENSOR;
PADDLE_ENFORCE_GE(
tensor->numel(), 0,
"You should call ZeroCopyTensor::Reshape(const std::vector<int> &shape)"
"function before copying data from cpu.");
PADDLE_ENFORCE_GE(tensor->numel(), 0,
platform::errors::PreconditionNotMet(
"You should call ZeroCopyTensor::Reshape(const "
"std::vector<int> &shape)"
"function before copying data from cpu."));
size_t ele_size = tensor->numel() * sizeof(T);
if (place_ == PaddlePlace::kCPU) {
......@@ -116,7 +126,8 @@ void ZeroCopyTensor::copy_from_cpu(const T *data) {
memory::Copy(gpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
data, ele_size, dev_ctx->stream());
#else
PADDLE_THROW("Not compiled with CUDA, should not reach here.");
PADDLE_THROW(platform::errors::Unavailable(
"Not compiled with CUDA, should not reach here."));
#endif
}
}
......@@ -141,7 +152,8 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
cudaStreamSynchronize(dev_ctx->stream());
#else
PADDLE_THROW("Not compile with CUDA, should not reach here.");
PADDLE_THROW(platform::errors::Unavailable(
"Not compile with CUDA, should not reach here."));
#endif
}
}
......@@ -176,20 +188,27 @@ template PD_INFER_DECL uint8_t *ZeroCopyTensor::mutable_data<uint8_t>(
PaddlePlace place);
void *ZeroCopyTensor::FindTensor() const {
PADDLE_ENFORCE(!name_.empty(),
PADDLE_ENFORCE_EQ(
name_.empty(), false,
platform::errors::PreconditionNotMet(
"Need to SetName first, so that the corresponding tensor can "
"be retrieved.");
PADDLE_ENFORCE(scope_);
"be retrieved."));
PADDLE_ENFORCE_NOT_NULL(scope_, platform::errors::PreconditionNotMet(
"The scope should not be nullptr."));
auto *scope = static_cast<framework::Scope *>(scope_);
auto *var = scope->FindVar(name_);
PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::PreconditionNotMet(
"No tensor called [%s] in the runtime scope", name_));
auto *tensor = var->GetMutable<framework::LoDTensor>();
return tensor;
}
std::vector<int> ZeroCopyTensor::shape() const {
EAGER_GET_TENSOR;
PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_);
PADDLE_ENFORCE_NOT_NULL(
tensor_, platform::errors::PreconditionNotMet(
"Not found tensor called %s in the scope", name_));
return framework::vectorize<int>(tensor->dims());
}
......
......@@ -31,12 +31,30 @@ limitations under the License. */
#include "paddle_analysis_config.h" // NOLINT
#include "paddle_api.h" // NOLINT
///
/// \file paddle_inference_api.h
///
/// \brief Paddle Inference API
///
/// \author paddle-infer@baidu.com
/// \date 2020-09-01
/// \since 2.0.0-beta
///
namespace paddle_infer {
using DataType = paddle::PaddleDType;
using PlaceType = paddle::PaddlePlace;
using PrecisionType = paddle::AnalysisConfig::Precision;
using Config = paddle::AnalysisConfig;
///
/// \class Tensor
///
/// \brief Represents an n-dimensional array of values.
/// The Tensor is used to store the input or output of the network.
/// It is obtained through Predictor::GetinputHandle()
/// and Predictor::GetOutputHandle() interface.
///
class PD_INFER_DECL Tensor {
public:
// Can only be created by predictor->GetInputHandle(cosnt std::string& name)
......@@ -44,33 +62,106 @@ class PD_INFER_DECL Tensor {
Tensor() = delete;
explicit Tensor(std::unique_ptr<paddle::ZeroCopyTensor>&& tensor)
: tensor_(std::move(tensor)) {}
///
/// \brief Reset the shape of the tensor.
/// Generally it's only used for the input tensor.
/// Reshape must be called before calling mutable_data() or CopyFromCpu()
/// \param shape The shape to set.
///
void Reshape(const std::vector<int>& shape);
///
/// \brief Copy the host memory to tensor data.
/// It's usually used to set the input tensor data.
/// \param data The pointer of the data, from which the tensor will copy.
///
template <typename T>
void CopyFromCpu(const T* data);
// should add the place
///
/// \brief Get the memory pointer in CPU or GPU with specific data type.
/// Please Reshape the tensor first before call this.
/// It's usually used to get input data pointer.
/// \param place The place of the tensor.
/// \return The tensor data buffer pointer.
///
template <typename T>
T* mutable_data(PlaceType place);
///
/// \brief Copy the tensor data to the host memory.
/// It's usually used to get the output tensor data.
/// \param[out] data The tensor will copy the data to the address.
///
template <typename T>
void CopyToCpu(T* data);
///
/// \brief Get the memory pointer directly.
/// It's usually used to get the output data pointer.
/// \param[out] place To get the device type of the tensor.
/// \param[out] size To get the data size of the tensor.
/// \return The tensor data buffer pointer.
///
template <typename T>
T* data(PlaceType* place, int* size) const;
///
/// \brief Set lod info of the tensor.
/// More about LOD can be seen here:
/// https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/basic_concept/lod_tensor.html#lodtensor
/// \param x the lod info.
///
void SetLoD(const std::vector<std::vector<size_t>>& x);
/// \brief Return the lod info of the tensor.
std::vector<std::vector<size_t>> lod() const;
/// \brief Return the data type of the tensor.
/// It's usually used to get the output tensor data type.
/// \return The data type of the tensor.
DataType type() const;
/// \brief Return the shape of the Tensor.
std::vector<int> shape() const;
/// \brief Return the name of the tensor.
const std::string& name() const;
private:
std::unique_ptr<paddle::ZeroCopyTensor> tensor_;
};
///
/// \class Predictor
///
/// \brief Predictor is the interface for model prediction.
///
/// The predictor has the following typical uses:
///
/// Get predictor
/// \code{cpp}
/// auto predictor = CreatePredictor(config);
/// \endcode
///
/// Get input or output names
/// \code{cpp}
/// auto input_names = predictor->GetInputNames();
/// auto output_names = predictor->GetOutputNames();
/// \endcode
///
/// Get input or output handle
/// \code{cpp}
/// auto input_t = predictor->GetInputHandle(input_names[0]);
/// auto output_t = predictor->GetOutputHandle(output_names[0]);
/// \endcode
///
/// Run predictor
/// \code{cpp}
/// predictor->Run();
/// \endcode
///
class PD_INFER_DECL Predictor {
public:
Predictor() = delete;
......@@ -79,25 +170,78 @@ class PD_INFER_DECL Predictor {
explicit Predictor(std::unique_ptr<paddle::PaddlePredictor>&& pred)
: predictor_(std::move(pred)) {}
///
/// \brief Construct a new Predictor object
///
/// \param[in] Config config
///
explicit Predictor(const Config& config);
///
/// \brief Get the input names
///
/// \return input names
///
std::vector<std::string> GetInputNames();
///
/// \brief Get the Input Tensor object
///
/// \param[in] name input name
/// \return input tensor
///
std::unique_ptr<Tensor> GetInputHandle(const std::string& name);
///
/// \brief Run the prediction engine
///
/// \return Whether the function executed successfully
///
bool Run();
///
/// \brief Get the output names
///
/// \return output names
///
std::vector<std::string> GetOutputNames();
///
/// \brief Get the Output Tensor object
///
/// \param[in] name otuput name
/// \return output tensor
///
std::unique_ptr<Tensor> GetOutputHandle(const std::string& name);
///
/// \brief Clone to get the new predictor. thread safe.
///
/// \return get a new predictor
///
std::unique_ptr<Predictor> Clone();
/// \brief Clear the intermediate tensors of the predictor
void ClearIntermediateTensor();
private:
std::unique_ptr<paddle::PaddlePredictor> predictor_;
};
///
/// \brief A factory to help create predictors.
///
/// Usage:
///
/// \code{.cpp}
/// Config config;
/// ... // change the configs.
/// auto predictor = CreatePredictor(config);
/// \endcode
///
PD_INFER_DECL std::shared_ptr<Predictor> CreatePredictor(
const Config& config); // NOLINT
PD_INFER_DECL int GetNumBytesOfDataType(DataType dtype);
PD_INFER_DECL std::string GetVersion();
......@@ -128,13 +272,24 @@ T* Tensor::data(PlaceType* place, int* size) const {
namespace paddle_infer {
namespace services {
///
/// \class PredictorPool
///
/// \brief PredictorPool is a simple encapsulation of Predictor, suitable for
/// use in multi-threaded situations. According to the thread id, the
/// corresponding Predictor is taken out from PredictorPool to complete the
/// prediction.
///
class PD_INFER_DECL PredictorPool {
public:
PredictorPool() = delete;
PredictorPool(const PredictorPool&) = delete;
PredictorPool& operator=(const PredictorPool&) = delete;
/// \brief Construct the predictor pool with \param size predictor instances.
explicit PredictorPool(const Config& config, size_t size = 1);
/// \brief Get \param id-th predictor.
Predictor* Retrive(size_t idx);
private:
......
......@@ -16,6 +16,7 @@
#include <vector>
#include "paddle/fluid/inference/capi/c_api_internal.h"
#include "paddle/fluid/inference/capi/paddle_c_api.h"
#include "paddle/fluid/platform/enforce.h"
using paddle::ConvertToACPrecision;
using paddle::ConvertToPaddleDType;
......@@ -34,27 +35,37 @@ void PD_DeletePaddleBuf(PD_PaddleBuf* buf) {
}
void PD_PaddleBufResize(PD_PaddleBuf* buf, size_t length) {
PADDLE_ENFORCE_NOT_NULL(buf);
PADDLE_ENFORCE_NOT_NULL(buf,
paddle::platform::errors::InvalidArgument(
"The pointer of Buffer shouldn't be nullptr"));
buf->buf.Resize(length);
}
void PD_PaddleBufReset(PD_PaddleBuf* buf, void* data, size_t length) {
PADDLE_ENFORCE_NOT_NULL(buf);
PADDLE_ENFORCE_NOT_NULL(buf,
paddle::platform::errors::InvalidArgument(
"The pointer of Buffer shouldn't be nullptr"));
buf->buf.Reset(data, length);
}
bool PD_PaddleBufEmpty(PD_PaddleBuf* buf) {
PADDLE_ENFORCE_NOT_NULL(buf);
PADDLE_ENFORCE_NOT_NULL(buf,
paddle::platform::errors::InvalidArgument(
"The pointer of Buffer shouldn't be nullptr"));
return buf->buf.empty();
}
void* PD_PaddleBufData(PD_PaddleBuf* buf) {
PADDLE_ENFORCE_NOT_NULL(buf);
PADDLE_ENFORCE_NOT_NULL(buf,
paddle::platform::errors::InvalidArgument(
"The pointer of Buffer shouldn't be nullptr"));
return buf->buf.data();
}
size_t PD_PaddleBufLength(PD_PaddleBuf* buf) {
PADDLE_ENFORCE_NOT_NULL(buf);
PADDLE_ENFORCE_NOT_NULL(buf,
paddle::platform::errors::InvalidArgument(
"The pointer of Buffer shouldn't be nullptr"));
return buf->buf.length();
}
......
......@@ -18,7 +18,6 @@
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/api/paddle_api.h"
#include "paddle/fluid/inference/capi/paddle_c_api.h"
#include "paddle/fluid/platform/enforce.h"
using PD_PaddleDType = paddle::PaddleDType;
using PD_ACPrecision = paddle::AnalysisConfig::Precision;
......
......@@ -20,6 +20,7 @@
#include <vector>
#include "paddle/fluid/inference/capi/c_api_internal.h"
#include "paddle/fluid/inference/capi/paddle_c_api.h"
#include "paddle/fluid/platform/enforce.h"
using paddle::ConvertToACPrecision;
using paddle::ConvertToPaddleDType;
......@@ -40,7 +41,10 @@ void PD_DeleteAnalysisConfig(PD_AnalysisConfig* config) {
void PD_SetModel(PD_AnalysisConfig* config, const char* model_dir,
const char* params_path) {
LOG(INFO) << model_dir;
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
LOG(INFO) << std::string(model_dir);
if (!params_path) {
config->config.SetModel(std::string(model_dir));
......@@ -50,104 +54,164 @@ void PD_SetModel(PD_AnalysisConfig* config, const char* model_dir,
}
void PD_SetProgFile(PD_AnalysisConfig* config, const char* x) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SetProgFile(std::string(x));
}
void PD_SetParamsFile(PD_AnalysisConfig* config, const char* x) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SetParamsFile(std::string(x));
}
void PD_SetOptimCacheDir(PD_AnalysisConfig* config, const char* opt_cache_dir) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SetOptimCacheDir(std::string(opt_cache_dir));
}
const char* PD_ModelDir(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.model_dir().c_str();
}
const char* PD_ProgFile(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.prog_file().c_str();
}
const char* PD_ParamsFile(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.params_file().c_str();
}
void PD_EnableUseGpu(PD_AnalysisConfig* config, int memory_pool_init_size_mb,
int device_id) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableUseGpu(static_cast<uint64_t>(memory_pool_init_size_mb),
device_id);
}
void PD_DisableGpu(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.DisableGpu();
}
bool PD_UseGpu(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.use_gpu();
}
int PD_GpuDeviceId(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.gpu_device_id();
}
int PD_MemoryPoolInitSizeMb(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.memory_pool_init_size_mb();
}
float PD_FractionOfGpuMemoryForPool(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.fraction_of_gpu_memory_for_pool();
}
void PD_EnableCUDNN(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableCUDNN();
}
bool PD_CudnnEnabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.cudnn_enabled();
}
void PD_SwitchIrOptim(PD_AnalysisConfig* config, bool x) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SwitchIrOptim(x);
}
bool PD_IrOptim(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.ir_optim();
}
void PD_SwitchUseFeedFetchOps(PD_AnalysisConfig* config, bool x) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SwitchUseFeedFetchOps(x);
}
bool PD_UseFeedFetchOpsEnabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.use_feed_fetch_ops_enabled();
}
void PD_SwitchSpecifyInputNames(PD_AnalysisConfig* config, bool x) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SwitchSpecifyInputNames(x);
}
bool PD_SpecifyInputName(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.specify_input_name();
}
......@@ -155,110 +219,168 @@ void PD_EnableTensorRtEngine(PD_AnalysisConfig* config, int workspace_size,
int max_batch_size, int min_subgraph_size,
Precision precision, bool use_static,
bool use_calib_mode) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableTensorRtEngine(
workspace_size, max_batch_size, min_subgraph_size,
paddle::ConvertToACPrecision(precision), use_static, use_calib_mode);
}
bool PD_TensorrtEngineEnabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.tensorrt_engine_enabled();
}
void PD_SwitchIrDebug(PD_AnalysisConfig* config, bool x) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SwitchIrDebug(x);
}
void PD_EnableMKLDNN(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableMKLDNN();
}
void PD_SetMkldnnCacheCapacity(PD_AnalysisConfig* config, int capacity) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SetMkldnnCacheCapacity(capacity);
}
bool PD_MkldnnEnabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.mkldnn_enabled();
}
void PD_SetCpuMathLibraryNumThreads(PD_AnalysisConfig* config,
int cpu_math_library_num_threads) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SetCpuMathLibraryNumThreads(cpu_math_library_num_threads);
}
int PD_CpuMathLibraryNumThreads(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.cpu_math_library_num_threads();
}
void PD_EnableMkldnnQuantizer(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableMkldnnQuantizer();
}
bool PD_MkldnnQuantizerEnabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.mkldnn_quantizer_enabled();
}
void PD_EnableMkldnnBfloat16(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound(
"PD_AnalysisConfig should not be null"));
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableMkldnnBfloat16();
}
bool PD_MkldnnBfloat16Enabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound(
"PD_AnalysisConfig should not be null"));
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.mkldnn_bfloat16_enabled();
}
void PD_SetModelBuffer(PD_AnalysisConfig* config, const char* prog_buffer,
size_t prog_buffer_size, const char* params_buffer,
size_t params_buffer_size) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SetModelBuffer(prog_buffer, prog_buffer_size, params_buffer,
params_buffer_size);
}
bool PD_ModelFromMemory(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.model_from_memory();
}
void PD_EnableMemoryOptim(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableMemoryOptim();
}
bool PD_MemoryOptimEnabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.enable_memory_optim();
}
void PD_EnableProfile(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableProfile();
}
bool PD_ProfileEnabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.profile_enabled();
}
void PD_SetInValid(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SetInValid();
}
bool PD_IsValid(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.is_valid();
}
......
......@@ -22,6 +22,7 @@
#include "paddle/fluid/inference/api/paddle_api.h"
#include "paddle/fluid/inference/capi/c_api_internal.h"
#include "paddle/fluid/inference/capi/paddle_c_api.h"
#include "paddle/fluid/platform/enforce.h"
using paddle::ConvertToACPrecision;
using paddle::ConvertToPaddleDType;
......@@ -81,7 +82,10 @@ extern "C" {
bool PD_PredictorRun(const PD_AnalysisConfig* config, PD_Tensor* inputs,
int in_size, PD_Tensor** output_data, int* out_size,
int batch_size) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
VLOG(3) << "Predoctor: PD_PredictorRun. ";
static std::map<std::string, std::unique_ptr<paddle::PaddlePredictor>>
predictors;
......@@ -111,7 +115,10 @@ bool PD_PredictorRun(const PD_AnalysisConfig* config, PD_Tensor* inputs,
bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config,
PD_ZeroCopyData* inputs, int in_size,
PD_ZeroCopyData** output, int* out_size) {
PADDLE_ENFORCE_NOT_NULL(config);
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
static std::map<std::string, std::unique_ptr<paddle::PaddlePredictor>>
predictors;
if (!predictors.count(config->config.model_dir())) {
......@@ -144,7 +151,8 @@ bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config,
input_t->copy_from_cpu(static_cast<uint8_t*>(inputs[i].data));
break;
default:
CHECK(false) << "Unsupport data type.";
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"Unsupported data type."));
break;
}
}
......@@ -227,7 +235,8 @@ void PD_SetZeroCopyInput(PD_Predictor* predictor,
input->copy_from_cpu(static_cast<uint8_t*>(tensor->data.data));
break;
default:
CHECK(false) << "Unsupport data type.";
PADDLE_THROW(
paddle::platform::errors::InvalidArgument("Unsupported data type."));
break;
}
......@@ -294,7 +303,8 @@ void PD_GetZeroCopyOutput(PD_Predictor* predictor, PD_ZeroCopyTensor* tensor) {
output->copy_to_cpu(reinterpret_cast<uint8_t*>(tensor->data.data));
break;
default:
CHECK(false) << "Unsupport data type.";
PADDLE_THROW(
paddle::platform::errors::InvalidArgument("Unsupported data type."));
break;
}
}
......
......@@ -19,6 +19,7 @@
#include <vector>
#include "paddle/fluid/inference/capi/c_api_internal.h"
#include "paddle/fluid/inference/capi/paddle_c_api.h"
#include "paddle/fluid/platform/enforce.h"
using paddle::ConvertToACPrecision;
using paddle::ConvertToPaddleDType;
......@@ -37,44 +38,60 @@ void PD_DeletePaddleTensor(PD_Tensor* tensor) {
}
void PD_SetPaddleTensorName(PD_Tensor* tensor, char* name) {
PADDLE_ENFORCE_NOT_NULL(tensor);
PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
tensor->tensor.name = std::string(name);
}
void PD_SetPaddleTensorDType(PD_Tensor* tensor, PD_DataType dtype) {
PADDLE_ENFORCE_NOT_NULL(tensor);
PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
tensor->tensor.dtype = paddle::ConvertToPaddleDType(dtype);
}
void PD_SetPaddleTensorData(PD_Tensor* tensor, PD_PaddleBuf* buf) {
PADDLE_ENFORCE_NOT_NULL(tensor);
PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
tensor->tensor.data = buf->buf;
}
void PD_SetPaddleTensorShape(PD_Tensor* tensor, int* shape, int size) {
PADDLE_ENFORCE_NOT_NULL(tensor);
PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
tensor->tensor.shape.assign(shape, shape + size);
}
const char* PD_GetPaddleTensorName(const PD_Tensor* tensor) {
PADDLE_ENFORCE_NOT_NULL(tensor);
PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
return tensor->tensor.name.c_str();
}
PD_DataType PD_GetPaddleTensorDType(const PD_Tensor* tensor) {
PADDLE_ENFORCE_NOT_NULL(tensor);
PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
return ConvertToPDDataType(tensor->tensor.dtype);
}
PD_PaddleBuf* PD_GetPaddleTensorData(const PD_Tensor* tensor) {
PADDLE_ENFORCE_NOT_NULL(tensor);
PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
PD_PaddleBuf* ret = PD_NewPaddleBuf();
ret->buf = tensor->tensor.data;
return ret;
}
const int* PD_GetPaddleTensorShape(const PD_Tensor* tensor, int* size) {
PADDLE_ENFORCE_NOT_NULL(tensor);
PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
const std::vector<int>& shape = tensor->tensor.shape;
*size = shape.size();
return shape.data();
......
......@@ -20,8 +20,12 @@
#define LITE_WITH_XPU 1
#endif
#ifndef PADDLE_WITH_ARM
#define LITE_WITH_X86 1
#endif
#include "paddle/fluid/inference/lite/engine.h"
#include "lite/api/paddle_use_passes.h"
#include <utility>
namespace paddle {
namespace inference {
......@@ -36,32 +40,40 @@ bool EngineManager::Has(const std::string& name) const {
return engines_.at(name).get() != nullptr;
}
paddle::lite::Predictor* EngineManager::Get(const std::string& name) const {
paddle::lite_api::PaddlePredictor* EngineManager::Get(
const std::string& name) const {
return engines_.at(name).get();
}
paddle::lite::Predictor* EngineManager::Create(const std::string& name,
const EngineConfig& cfg) {
if (cfg.valid_places.front().target == TARGET(kCUDA)) {
#ifdef PADDLE_WITH_CUDA
paddle::lite::Env<TARGET(kCUDA)>::Init();
paddle::lite_api::PaddlePredictor* EngineManager::Create(
const std::string& name, const EngineConfig& cfg) {
// config info for predictor.
paddle::lite_api::CxxConfig lite_cxx_config;
lite_cxx_config.set_model_buffer(cfg.model.c_str(), cfg.model.size(),
cfg.param.c_str(), cfg.param.size());
lite_cxx_config.set_valid_places(cfg.valid_places);
#ifdef PADDLE_WITH_ARM
set_threads.set_threads(cfg.cpu_math_library_num_threads);
#else
lite_cxx_config.set_x86_math_library_num_threads(
cfg.cpu_math_library_num_threads);
#endif
} else if (cfg.valid_places.front().target == TARGET(kXPU)) {
#ifdef PADDLE_WITH_XPU
paddle::lite::TargetWrapper<TARGET(kXPU)>::workspace_l3_size_per_thread =
cfg.xpu_l3_workspace_size;
lite_cxx_config.set_xpu_workspace_l3_size_per_thread(
cfg.xpu_l3_workspace_size);
#endif
}
auto* p = new paddle::lite::Predictor();
p->Build("", cfg.model, cfg.param, cfg.valid_places, cfg.neglected_passes,
cfg.model_type, cfg.model_from_memory);
engines_[name].reset(p);
return p;
// create predictor
std::shared_ptr<paddle::lite_api::PaddlePredictor> p =
paddle::lite_api::CreatePaddlePredictor(lite_cxx_config);
engines_[name] = std::move(p);
return engines_[name].get();
}
void EngineManager::DeleteAll() {
for (auto& item : engines_) {
item.second.reset(nullptr);
item.second.reset();
}
}
......
......@@ -23,12 +23,9 @@
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wall"
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_place.h"
#include "lite/core/context.h"
#include "lite/core/device_info.h"
#include "lite/core/memory.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/api/paddle_use_passes.h"
#pragma GCC diagnostic pop
namespace paddle {
......@@ -38,25 +35,33 @@ namespace lite {
struct EngineConfig {
std::string model;
std::string param;
paddle::lite::Place prefer_place;
std::vector<paddle::lite::Place> valid_places;
std::vector<paddle::lite_api::Place> valid_places;
std::vector<std::string> neglected_passes;
lite_api::LiteModelType model_type{lite_api::LiteModelType::kProtobuf};
bool model_from_memory{true};
// for xpu
size_t xpu_l3_workspace_size;
// for x86 or arm
int cpu_math_library_num_threads{1};
// for cuda
bool use_multi_stream{false};
};
class EngineManager {
public:
bool Empty() const;
bool Has(const std::string& name) const;
paddle::lite::Predictor* Get(const std::string& name) const;
paddle::lite::Predictor* Create(const std::string& name,
paddle::lite_api::PaddlePredictor* Get(const std::string& name) const;
paddle::lite_api::PaddlePredictor* Create(const std::string& name,
const EngineConfig& cfg);
void DeleteAll();
private:
std::unordered_map<std::string, std::unique_ptr<paddle::lite::Predictor>>
std::unordered_map<std::string,
std::shared_ptr<paddle::lite_api::PaddlePredictor>>
engines_;
};
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/inference/lite/tensor_utils.h"
#include <functional>
#include <map>
#include <memory>
#include "paddle/fluid/framework/data_type.h"
......@@ -144,16 +145,55 @@ void MemoryCopyAsync(const platform::Place& dst_place, void* dst_data,
}
}
void InitDstTensor(paddle::lite::Tensor* dst, const framework::LoDTensor& src) {
void* GetLiteTensorDataPtr(paddle::lite_api::Tensor* src,
PrecisionType precision_type,
TargetType target_type) {
void* res{nullptr};
switch (precision_type) {
case PrecisionType::kFloat:
res = static_cast<void*>(src->mutable_data<float>(target_type));
break;
case PrecisionType::kInt8:
res = static_cast<void*>(src->mutable_data<int8_t>(target_type));
break;
case PrecisionType::kInt32:
res = static_cast<void*>(src->mutable_data<int32_t>(target_type));
break;
case PrecisionType::kInt64:
res = static_cast<void*>(src->mutable_data<int64_t>(target_type));
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported precision type. Now only supports FP32, INT8, INT32 and "
"INT64."));
break;
}
return res;
}
int64_t GetLiteTensorNumel(const paddle::lite_api::Tensor& tensor) {
auto shape = tensor.shape();
int64_t numel = std::accumulate(shape.begin(), shape.end(), 1,
std::multiplies<int64_t>());
return numel;
}
void InitDstTensor(paddle::lite_api::Tensor* dst,
const framework::LoDTensor& src) {
// Currently, Lite needs to explicitly specify the target type of
// the input tensor.
constexpr int empty_size = 0;
dst->mutable_data(GetLiteTargetType(src.place()), empty_size);
dst->set_precision(GetLitePrecisionType(src.type()));
SetLoD(dst->mutable_lod(), src.lod());
dst->Resize({empty_size});
GetLiteTensorDataPtr(dst, GetLitePrecisionType(src.type()),
GetLiteTargetType(src.place()));
dst->SetPrecision(GetLitePrecisionType(src.type()));
paddle::lite::LoD lite_lod;
SetLoD(&lite_lod, src.lod());
dst->SetLoD(lite_lod);
}
void InitDstTensor(framework::LoDTensor* dst, const paddle::lite::Tensor& src) {
void InitDstTensor(framework::LoDTensor* dst,
const paddle::lite_api::Tensor& src) {
constexpr framework::proto::VarType::Type dtype =
framework::proto::VarType_Type_FP32;
dst->mutable_data(inference::lite::utils::GetNativePlace(src.target()),
......@@ -162,7 +202,8 @@ void InitDstTensor(framework::LoDTensor* dst, const paddle::lite::Tensor& src) {
}
template <>
void TensorCopyAsync(paddle::lite::Tensor* dst, const framework::LoDTensor& src,
void TensorCopyAsync(paddle::lite_api::Tensor* dst,
const framework::LoDTensor& src,
const platform::DeviceContext& ctx) {
InitDstTensor(dst, src);
const platform::Place& src_place = src.place();
......@@ -171,52 +212,56 @@ void TensorCopyAsync(paddle::lite::Tensor* dst, const framework::LoDTensor& src,
static_cast<size_t>(src.numel()) * framework::SizeOfType(src.type());
dst->Resize(framework::vectorize(src.dims()));
const void* src_data = src.data<void>();
void* dst_data = dst->mutable_data(bytes);
void* dst_data{nullptr};
dst_data = GetLiteTensorDataPtr(dst, GetLitePrecisionType(src.type()),
GetLiteTargetType(src.place()));
VLOG(3) << "[CopyAsync fluid -> lite] Bytes = " << bytes << ", src = " << &src
<< ", dst = " << dst << ", src_type = " << src.type();
MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx);
VLOG(3) << "[Lite memory size] Bytes = " << dst->memory_size();
VLOG(3) << "[Lite memory size] Bytes = " << bytes;
}
template <>
void TensorCopyAsync(framework::LoDTensor* dst, const paddle::lite::Tensor& src,
void TensorCopyAsync(framework::LoDTensor* dst,
const paddle::lite_api::Tensor& src,
const platform::DeviceContext& ctx) {
dst->Resize(paddle::framework::make_ddim(src.dims().Vectorize()));
dst->Resize(paddle::framework::make_ddim(src.shape()));
InitDstTensor(dst, src);
const platform::Place& src_place = GetNativePlace(src.target());
const platform::Place& dst_place = dst->place();
const size_t bytes =
static_cast<size_t>(src.numel()) * framework::SizeOfType(dst->type());
const void* src_data = src.raw_data();
int64_t src_numel = GetLiteTensorNumel(src);
const size_t bytes = src_numel * framework::SizeOfType(dst->type());
const void* src_data = src.data<void>();
// When Lite is ready, the source type needs to be modified here.
void* dst_data = dst->mutable_data(dst_place, dst->type());
VLOG(3) << "[CopyAsync lite -> fluid] Bytes = " << bytes << ", src = " << &src
<< ", dst = " << dst << ", src_type = " << dst->type();
MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx);
VLOG(3) << "[Lite memory size] Bytes = " << src.memory_size();
VLOG(3) << "[Lite memory size] Bytes = " << bytes;
}
template <>
void TensorDataShare(paddle::lite::Tensor* dst, framework::LoDTensor* src) {
const size_t bytes =
static_cast<size_t>(src->numel()) * framework::SizeOfType(src->type());
auto buf = std::make_shared<paddle::lite::Buffer>(paddle::lite::Buffer(
src->data<void>(), GetLiteTargetType(src->place()), src->memory_size()));
void TensorDataShare(paddle::lite_api::Tensor* dst, framework::LoDTensor* src) {
dst->Resize(framework::vectorize(src->dims()));
dst->set_precision(GetLitePrecisionType(src->type()));
SetLoD(dst->mutable_lod(), src->lod());
dst->ResetBuffer(buf, bytes);
dst->ShareExternalMemory(src->data<void>(), src->memory_size(),
GetLiteTargetType(src->place()));
dst->SetPrecision(GetLitePrecisionType(src->type()));
paddle::lite::LoD lite_lod;
SetLoD(&lite_lod, src->lod());
dst->SetLoD(lite_lod);
}
template <>
void TensorDataShare(framework::LoDTensor* dst, paddle::lite::Tensor* src) {
void TensorDataShare(framework::LoDTensor* dst, paddle::lite_api::Tensor* src) {
constexpr framework::proto::VarType::Type dtype =
framework::proto::VarType_Type_FP32;
void* src_raw_data = src->raw_data();
void* src_raw_data =
GetLiteTensorDataPtr(src, GetLitePrecisionType(dtype), src->target());
size_t memory_size = GetLiteTensorNumel(*src) * sizeof(float);
std::shared_ptr<memory::allocation::Allocation> holder(
new memory::allocation::Allocation(src_raw_data, src->memory_size(),
new memory::allocation::Allocation(src_raw_data, memory_size,
GetNativePlace(src->target())));
dst->Resize(paddle::framework::make_ddim(src->dims().Vectorize()));
dst->Resize(paddle::framework::make_ddim(src->shape()));
SetLoD(dst->mutable_lod(), src->lod());
dst->ResetHolderWithType(holder, dtype);
}
......
......@@ -102,10 +102,10 @@ TEST(EngineManager, engine) {
config.model_from_memory = true;
config.valid_places = {
#ifdef PADDLE_WITH_CUDA
paddle::lite::Place({TARGET(kCUDA), PRECISION(kFloat)}),
paddle::lite_api::Place({TARGET(kCUDA), PRECISION(kFloat)}),
#endif
paddle::lite::Place({TARGET(kX86), PRECISION(kFloat)}),
paddle::lite::Place({TARGET(kHost), PRECISION(kAny)}),
paddle::lite_api::Place({TARGET(kX86), PRECISION(kFloat)}),
paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}),
};
LOG(INFO) << "Create EngineManager";
......@@ -118,7 +118,7 @@ TEST(EngineManager, engine) {
ASSERT_EQ(inference::Singleton<inference::lite::EngineManager>::Global().Has(
unique_key),
true);
paddle::lite::Predictor* engine_0 =
paddle::lite_api::PaddlePredictor* engine_0 =
inference::Singleton<inference::lite::EngineManager>::Global().Get(
unique_key);
CHECK_NOTNULL(engine_0);
......
......@@ -73,6 +73,33 @@ TEST(LiteEngineOp, GetNativeLayoutType) {
EXPECT_ANY_THROW(GetNativeLayoutType(DataLayoutType::kNHWC));
}
template <typename T>
void test_lite_tensor_data_ptr(PrecisionType precision_type) {
void* GetLiteTensorDataPtr(paddle::lite_api::Tensor * src,
PrecisionType precision_type,
TargetType target_type);
const int count = 4;
paddle::lite::Tensor lite_tensor;
lite_tensor.Resize({count});
auto* lite_tensor_data = lite_tensor.mutable_data<T>();
for (size_t i = 0; i < count; ++i) {
lite_tensor_data[i] = i;
}
paddle::lite_api::Tensor lite_api_tensor(&lite_tensor);
T* data = static_cast<T*>(GetLiteTensorDataPtr(
&lite_api_tensor, precision_type, TargetType::kHost));
for (size_t i = 0; i < count; ++i) {
CHECK_EQ(data[i], static_cast<T>(i)) << "the i-th num is not correct.";
}
}
TEST(LiteEngineOp, GetLiteTensorDataPtr) {
test_lite_tensor_data_ptr<int64_t>(PrecisionType::kInt64);
test_lite_tensor_data_ptr<int32_t>(PrecisionType::kInt32);
test_lite_tensor_data_ptr<int8_t>(PrecisionType::kInt8);
EXPECT_ANY_THROW(test_lite_tensor_data_ptr<double>(PrecisionType::kUnk));
}
void test_tensor_copy(const platform::DeviceContext& ctx) {
// Create LoDTensor.
std::vector<float> vector({1, 2, 3, 4});
......@@ -83,10 +110,11 @@ void test_tensor_copy(const platform::DeviceContext& ctx) {
lod_tensor.set_lod(lod);
// Create lite::Tensor and copy.
paddle::lite::Tensor lite_tensor;
TensorCopyAsync(&lite_tensor, lod_tensor, ctx);
paddle::lite_api::Tensor lite_api_tensor(&lite_tensor);
TensorCopyAsync(&lite_api_tensor, lod_tensor, ctx);
// Copy to LoDTensor.
framework::LoDTensor lod_tensor_n;
TensorCopyAsync(&lod_tensor_n, lite_tensor, ctx);
TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx);
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) {
platform::GpuStreamSync(
......@@ -108,10 +136,11 @@ void test_tensor_share(const platform::DeviceContext& ctx) {
lod_tensor.set_lod(lod);
// Create lite::Tensor and share.
paddle::lite::Tensor lite_tensor;
TensorDataShare(&lite_tensor, &lod_tensor);
paddle::lite_api::Tensor lite_api_tensor(&lite_tensor);
TensorDataShare(&lite_api_tensor, &lod_tensor);
// Copy to LoDTensor.
framework::LoDTensor lod_tensor_n;
TensorCopyAsync(&lod_tensor_n, lite_tensor, ctx);
TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx);
std::vector<float> result;
TensorToVector(lod_tensor_n, ctx, &result);
ASSERT_EQ(result, vector);
......
......@@ -63,9 +63,11 @@ void TensorRTEngine::Execute(int batch_size, std::vector<void *> *buffers,
void TensorRTEngine::FreezeNetwork() {
freshDeviceId();
VLOG(3) << "TRT to freeze network";
PADDLE_ENFORCE(infer_builder_ != nullptr,
"Call InitNetwork first to initialize network.");
PADDLE_ENFORCE_EQ(network() != nullptr, true,
PADDLE_ENFORCE_NOT_NULL(infer_builder_,
platform::errors::InvalidArgument(
"Inference builder of TRT is null. Please make "
"sure you call InitNetwork first."));
PADDLE_ENFORCE_NOT_NULL(network(),
platform::errors::InvalidArgument(
"Call InitNetwork first to initialize network."));
// build engine.
......@@ -210,7 +212,10 @@ void TensorRTEngine::FreezeNetwork() {
} else {
infer_engine_.reset(infer_builder_->buildCudaEngine(*network()));
}
PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!");
PADDLE_ENFORCE_NOT_NULL(
infer_engine_, platform::errors::Fatal(
"Build TensorRT cuda engine failed! Please recheck "
"you configurations related to paddle-TensorRT."));
}
nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
......@@ -220,8 +225,16 @@ nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
platform::errors::InvalidArgument(
"The TRT network should be initialized first."));
auto *input = network()->addInput(name.c_str(), dtype, dims);
PADDLE_ENFORCE(input, "infer network add input %s failed", name);
PADDLE_ENFORCE(input->isNetworkInput());
PADDLE_ENFORCE_NOT_NULL(
input, platform::errors::InvalidArgument("Adding input %s failed in "
"TensorRT inference network. "
"Please recheck your input.",
name));
PADDLE_ENFORCE_EQ(input->isNetworkInput(), true,
platform::errors::InvalidArgument(
"Input %s is not the input of TRT inference network. "
"Please recheck your input.",
name));
TensorRTEngine::SetITensor(name, input);
return input;
}
......@@ -230,31 +243,53 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
const std::string &name) {
auto *output = layer->getOutput(offset);
SetITensor(name, output);
PADDLE_ENFORCE(output != nullptr);
PADDLE_ENFORCE_NOT_NULL(
output, platform::errors::InvalidArgument(
"The output %s of TRT engine should not be null.", name));
output->setName(name.c_str());
PADDLE_ENFORCE(!output->isNetworkInput());
PADDLE_ENFORCE_EQ(output->isNetworkInput(), false,
platform::errors::InvalidArgument(
"The output %s of TRT engine should not be the input "
"of the network at the same time.",
name));
network()->markOutput(*output);
PADDLE_ENFORCE(output->isNetworkOutput());
PADDLE_ENFORCE_EQ(
output->isNetworkOutput(), true,
platform::errors::InvalidArgument(
"The output %s of TRT engine should be the output of the network.",
name));
}
void TensorRTEngine::DeclareOutput(const std::string &name) {
auto *output = TensorRTEngine::GetITensor(name);
PADDLE_ENFORCE(output != nullptr);
PADDLE_ENFORCE_NOT_NULL(
output, platform::errors::InvalidArgument(
"The output %s of TRT engine should not be null.", name));
output->setName(name.c_str());
PADDLE_ENFORCE(!output->isNetworkInput());
PADDLE_ENFORCE_EQ(output->isNetworkInput(), false,
platform::errors::InvalidArgument(
"The output %s of TRT engine should not be the input "
"of the network at the same time.",
name));
network()->markOutput(*output);
}
void TensorRTEngine::SetITensor(const std::string &name,
nvinfer1::ITensor *tensor) {
PADDLE_ENFORCE(tensor != nullptr);
PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s",
name);
PADDLE_ENFORCE_NOT_NULL(
tensor, platform::errors::InvalidArgument(
"Tensor named %s of TRT engine should not be null.", name));
PADDLE_ENFORCE_EQ(
0, itensor_map_.count(name),
platform::errors::InvalidArgument(
"Tensor named %s of TRT engine should not be duplicated", name));
itensor_map_[name] = tensor;
}
nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) {
PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name);
PADDLE_ENFORCE_EQ(itensor_map_.count(name), true,
platform::errors::NotFound(
"Tensor named %s is not found in TRT engine", name));
return itensor_map_[name];
}
......@@ -271,11 +306,11 @@ float *TensorRTEngine::GetWeightCPUData(const std::string &name,
std::string splitter = "__";
std::string name_with_suffix = name + splitter + name_suffix;
platform::CPUPlace cpu_place;
PADDLE_ENFORCE_EQ(
weight_map.count(name_with_suffix), 0,
"During TRT Op converter: We set weight %s with the same name "
"twice into the weight_map",
name_with_suffix);
PADDLE_ENFORCE_EQ(weight_map.count(name_with_suffix), 0,
platform::errors::AlreadyExists(
"The weight named %s is set into the weight map "
"twice in TRT OP converter.",
name_with_suffix));
weight_map[name_with_suffix].reset(new framework::Tensor());
weight_map[name_with_suffix]->Resize(weight_tensor->dims());
TensorCopySync(*weight_tensor, cpu_place, weight_map[name_with_suffix].get());
......@@ -297,7 +332,10 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
void TensorRTEngine::freshDeviceId() {
int count;
cudaGetDeviceCount(&count);
PADDLE_ENFORCE_LT(device_id_, count);
PADDLE_ENFORCE_LT(device_id_, count,
platform::errors::OutOfRange(
"Device id %d exceeds the current device count: %d.",
device_id_, count));
cudaSetDevice(device_id_);
}
......
......@@ -196,8 +196,10 @@ class TensorRTEngine {
}
nvinfer1::IHostMemory* Serialize() {
PADDLE_ENFORCE(infer_engine_ != nullptr,
"You should build engine first and then serialize");
PADDLE_ENFORCE_NOT_NULL(
infer_engine_,
platform::errors::InvalidArgument(
"The TensorRT engine must be built first before serialization"));
ihost_memory_.reset(infer_engine_->serialize());
return ihost_memory_.get();
}
......@@ -222,8 +224,14 @@ class TensorRTEngine {
engine_serialized_data.c_str(), engine_serialized_data.size(),
&inference::Singleton<plugin::PluginFactoryTensorRT>::Global()));
}
PADDLE_ENFORCE(infer_engine_ != nullptr,
"build cuda engine failed when deserialize engine info.!");
PADDLE_ENFORCE_NOT_NULL(
infer_engine_,
platform::errors::Fatal(
"Building TRT cuda engine failed when deserializing engine info. "
"Please check:\n1. Your TRT serialization is generated and loaded "
"on the same GPU architecture;\n2. The Paddle Inference version of "
"generating serialization file and doing inference are "
"consistent."));
}
void SetRuntimeBatch(size_t batch_size);
......
......@@ -56,14 +56,27 @@ __global__ void elementwise_kernel(const size_t total, const T *x_data,
nvinfer1::Dims ElementWisePlugin::getOutputDimensions(
int index, const nvinfer1::Dims *input_dims, int num_inputs) {
PADDLE_ENFORCE_EQ(index, 0);
PADDLE_ENFORCE_EQ(num_inputs, 2);
PADDLE_ENFORCE_NOT_NULL(input_dims);
PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
"There is only one output in TRT elementwise "
"op plugin, but got output index: %d.",
index));
PADDLE_ENFORCE_EQ(num_inputs, 2, platform::errors::InvalidArgument(
"There are 2 inputs in TRT elementwise "
"op plugin, but got input number: %d.",
num_inputs));
PADDLE_ENFORCE_NOT_NULL(
input_dims,
platform::errors::InvalidArgument(
"The input dims of TRT elementwise op plugin should not be null."));
return input_dims[0];
}
int ElementWisePlugin::initialize() {
PADDLE_ENFORCE_GT(dims_y_.nbDims, 0);
PADDLE_ENFORCE_GT(dims_y_.nbDims, 0,
platform::errors::InvalidArgument(
"The dimension of input Y of TRT elementwise op plugin "
"should be greater than 0, but got %d.",
dims_y_.nbDims));
axis_ = (axis_ == -1) ? dims_x_.nbDims - dims_y_.nbDims : axis_;
int trimed_nb_dims = dims_y_.nbDims;
......@@ -74,8 +87,18 @@ int ElementWisePlugin::initialize() {
}
dims_y_.nbDims = trimed_nb_dims;
PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_);
PADDLE_ENFORCE_LT(axis_, dims_x_.nbDims);
PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_,
platform::errors::InvalidArgument(
"We expect [number of x dims] >= [number of y dims + "
"axis] in TRT elementwise op plugin, but got [number "
"of x dims] = %d, [number of y dims + axis] = %d.",
dims_x_.nbDims, dims_y_.nbDims + axis_));
PADDLE_ENFORCE_LT(
axis_, dims_x_.nbDims,
platform::errors::InvalidArgument("We expect [axis] < [number of x dims] "
"in TRT elementwise op plugin, but got "
"[axis] = %d, [number of x dims] = %d.",
axis_, dims_x_.nbDims));
prev_size_ = 1;
midd_size_ = 1;
......@@ -86,7 +109,9 @@ int ElementWisePlugin::initialize() {
for (int i = 0; i < dims_y_.nbDims; ++i) {
PADDLE_ENFORCE_EQ(dims_x_.d[i + axis_], dims_y_.d[i],
"Broadcast dimension mismatch.");
platform::errors::InvalidArgument(
"Broadcast dimension mismatch. The dims of input Y "
"should be a subsequence of X."));
midd_size_ *= dims_y_.d[i];
}
......@@ -221,7 +246,10 @@ int ElementwisePluginDynamic::enqueue(
elementwise_kernel<<<block, thread, 0, stream>>>(
num, x, y, out, prev_size, midd_size, post_size, details::Mul<float>());
} else {
PADDLE_THROW("Not implemented.");
PADDLE_THROW(platform::errors::Unimplemented(
"Paddle-TRT only support elementwise operation: {add, mul} currently, "
"but got %s.",
type_));
}
return cudaGetLastError() != cudaSuccess;
......
......@@ -74,7 +74,9 @@ TEST_F(TensorRTEngineTest, add_layer) {
nvinfer1::DimsCHW{1, 1, 1});
auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size,
weight.get(), bias.get());
PADDLE_ENFORCE(fc_layer != nullptr);
PADDLE_ENFORCE_NOT_NULL(fc_layer,
platform::errors::InvalidArgument(
"TRT fully connected layer building failed."));
engine_->DeclareOutput(fc_layer, 0, "y");
LOG(INFO) << "freeze network";
......@@ -116,7 +118,9 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
nvinfer1::DimsCHW{1, 2, 1});
auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2,
weight.get(), bias.get());
PADDLE_ENFORCE(fc_layer != nullptr);
PADDLE_ENFORCE_NOT_NULL(fc_layer,
platform::errors::InvalidArgument(
"TRT fully connected layer building failed."));
engine_->DeclareOutput(fc_layer, 0, "y");
engine_->FreezeNetwork();
......@@ -160,7 +164,9 @@ TEST_F(TensorRTEngineTest, test_conv2d) {
auto *conv_layer =
TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3},
weight.get(), bias.get());
PADDLE_ENFORCE(conv_layer != nullptr);
PADDLE_ENFORCE_NOT_NULL(conv_layer,
platform::errors::InvalidArgument(
"TRT convolution layer building failed."));
conv_layer->setStride(nvinfer1::DimsHW{1, 1});
conv_layer->setPadding(nvinfer1::DimsHW{1, 1});
......@@ -199,7 +205,9 @@ TEST_F(TensorRTEngineTest, test_pool2d) {
auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *x, pool_t,
nvinfer1::DimsHW{2, 2});
PADDLE_ENFORCE(pool_layer != nullptr);
PADDLE_ENFORCE_NOT_NULL(
pool_layer,
platform::errors::InvalidArgument("TRT pooling layer building failed."));
pool_layer->setStride(nvinfer1::DimsHW{1, 1});
pool_layer->setPadding(nvinfer1::DimsHW{0, 0});
......
......@@ -83,9 +83,8 @@ bool TRTInt8Calibrator::setBatch(
engine_name_, it.first));
}
const auto& d = dataptr->second;
PADDLE_ENFORCE(
cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice),
"Fail to cudaMemcpy %s for %s", engine_name_, it.first);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice));
}
data_is_set_ = true;
......
......@@ -342,9 +342,9 @@ if(WITH_MKLDNN)
### Lexcial analysis GRU model
set(GRU_PATH "${INFERENCE_DEMO_INSTALL_DIR}/gru")
download_GRU_data("${GRU_PATH}" "GRU_eval_data.tar.gz")
download_GRU_data("${GRU_PATH}" "GRU_eval_model.tar.gz")
download_GRU_data("${GRU_PATH}" "GRU_eval_model_v2.tar.gz")
set(GRU_DATA_PATH "${GRU_PATH}/GRU_eval_data.bin")
set(GRU_MODEL_PATH "${GRU_PATH}/GRU_eval_model")
set(GRU_MODEL_PATH "${GRU_PATH}/GRU_eval_model_v2")
set(LEXICAL_TEST_APP "test_analyzer_lexical_analysis")
set(LEXICAL_TEST_APP_SRC "analyzer_lexical_analysis_gru_tester.cc")
......
......@@ -27,7 +27,7 @@ TEST(AnalysisPredictor, use_gpu) {
AnalysisConfig config;
config.EnableUseGpu(100, 0);
config.SetModel(model_dir + "/model", model_dir + "/params");
config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32, true);
std::vector<PaddleTensor> inputs;
auto predictor = CreatePaddlePredictor(config);
......
......@@ -45,7 +45,9 @@ endif()
SET(OP_HEADER_DEPS xxhash executor)
if (WITH_GPU)
if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub)
endif()
endif()
SET(OP_PREFETCH_DEPS "")
......
......@@ -12,32 +12,30 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h"
#include <string>
#include <vector>
#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h"
#include "paddle/fluid/framework/tensor_util.h"
namespace paddle {
namespace operators {
class AmpCheckFiniteAndScaleOp : public framework::OperatorWithKernel {
class CheckFiniteAndUnscaleOp : public framework::OperatorWithKernel {
public:
AmpCheckFiniteAndScaleOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
CheckFiniteAndUnscaleOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X",
"amp_check_finite_and_unscale");
"check_finite_and_unscale");
OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out",
"amp_check_finite_and_unscale");
"check_finite_and_unscale");
PADDLE_ENFORCE_EQ(
ctx->Inputs("X").size(), ctx->Outputs("Out").size(),
platform::errors::InvalidArgument(
"The input(X) and output(Out) should have same size in "
"Operator(amp_check_finite_and_unscale), size of input(X) is %d "
"Operator(check_finite_and_unscale), size of input(X) is %d "
"and size of output(Out) is %d.",
ctx->Inputs("X").size(), ctx->Outputs("Out").size()));
auto x_dims = ctx->GetInputsDim("X");
......@@ -47,34 +45,34 @@ class AmpCheckFiniteAndScaleOp : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
}
};
class AmpCheckFiniteAndScaleOpMaker : public framework::OpProtoAndCheckerMaker {
class CheckFiniteAndUnscaleOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput(
"X",
"(Tensors) The input tensors of amp_check_finite_and_scale operator.")
"(Tensors) The input tensors of check_finite_and_unscale operator.")
.AsDuplicable();
AddInput("Scale",
"(Tensor) 1-dim tensor, the scale of amp_check_finite_and_scale "
"(Tensor) 1-dim tensor, the scale of check_finite_and_unscale "
"operator.");
AddOutput("Out",
"(Tensors) The scaled output tensor of "
"amp_check_finite_and_unscale operator.")
"check_finite_and_unscale operator.")
.AsDuplicable();
AddOutput("FoundInfinite",
"(Tensor) 1-dim tensor, contains a bool scalar, which indicates "
"if there there is infinite or nan item in input X.");
AddComment(R"DOC(
amp_check_finite_and_scale operator.
check_finite_and_unscale operator.
Check if input X contains all finite data, if yes, scale it by input Scale.
$$Out = X * scale$$
$$Out = X / scale$$
If any tensor in X contains Inf or Nan, the Out will generate a indicator.
FoundInfinite will be 1 (True), and Out will not be scaled. In this case, the data of
......@@ -85,20 +83,59 @@ Otherwise, FoundInfinite will be 0 (False).
}
};
template <typename T>
class CheckFiniteAndUnscaleCpuKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
const auto xs = ctx.MultiInput<framework::Tensor>("X");
const auto* scale = ctx.Input<framework::Tensor>("Scale");
auto outs = ctx.MultiOutput<framework::Tensor>("Out");
auto* found_inf = ctx.Output<framework::Tensor>("FoundInfinite");
const T* scale_data = scale->data<T>();
bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace());
*found_inf_data = false;
framework::Tensor is_finite =
ctx.AllocateTmpTensor<bool, platform::CPUDeviceContext>({1}, dev_ctx);
bool* is_finite_data = is_finite.template data<bool>();
auto& dev = *ctx.template device_context<platform::CPUDeviceContext>()
.eigen_device();
T inverse_scale = Inverse<T>(*scale_data);
for (size_t i = 0; i < xs.size(); ++i) {
const auto* x = xs[i];
auto* out = outs[i];
out->mutable_data<T>(dev_ctx.GetPlace());
if (!(*found_inf_data)) {
framework::TensorIsfinite(*x, &is_finite);
*found_inf_data = !(*is_finite_data);
}
auto eigen_out = framework::EigenVector<T>::Flatten(*out);
auto eigen_in = framework::EigenVector<T>::Flatten(*x);
if (!(*found_inf_data)) {
eigen_out.device(dev) = eigen_in * inverse_scale;
} else {
eigen_out.device(dev) = eigen_in * static_cast<T>(0);
}
}
return;
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(
amp_check_finite_and_scale, ops::AmpCheckFiniteAndScaleOp,
ops::AmpCheckFiniteAndScaleOpMaker,
check_finite_and_unscale, ops::CheckFiniteAndUnscaleOp,
ops::CheckFiniteAndUnscaleOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
amp_check_finite_and_scale,
ops::AmpCheckFiniteAndScaleKernel<paddle::platform::CPUDeviceContext,
float>,
ops::AmpCheckFiniteAndScaleKernel<paddle::platform::CPUDeviceContext,
double>);
REGISTER_OP_CPU_KERNEL(check_finite_and_unscale,
ops::CheckFiniteAndUnscaleCpuKernel<float>,
ops::CheckFiniteAndUnscaleCpuKernel<double>);
......@@ -14,28 +14,31 @@ limitations under the License. */
#include <cuda.h>
#include "paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void AmpCheckFiniteAndScale(const T* in, const T* scale, int num,
__global__ void GpuInverse(const T* s, T* o) {
*o = Inverse<T>(*s);
}
template <typename T>
__global__ void CheckFiniteAndUnscale(const T* in, const T* scale, int num,
bool* found_inf, T* out) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < num) {
if (!isfinite(in[idx])) {
*found_inf = 1;
*found_inf = true;
}
out[idx] = *found_inf ? in[idx] : in[idx] * scale[0];
out[idx] = *found_inf ? in[idx] : in[idx] * (*scale);
}
}
template <typename T>
class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
......@@ -48,6 +51,12 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T>
bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace());
cudaMemset(found_inf_data, false, found_inf->numel() * sizeof(bool));
framework::Tensor inverse_scale =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({1}, dev_ctx);
T* inverse_scale_v = inverse_scale.template data<T>();
GpuInverse<T><<<1, 1, 0, dev_ctx.stream()>>>(scale_data, inverse_scale_v);
for (size_t i = 0; i < xs.size(); ++i) {
const auto* x = xs[i];
auto* out = outs[i];
......@@ -55,11 +64,11 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T>
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = x->numel();
int block = 512;
int block = 1024;
int grid = (num + block - 1) / block;
VLOG(3) << "launch kernel";
AmpCheckFiniteAndScale<T><<<grid, block, 0, dev_ctx.stream()>>>(
x_data, scale_data, num, found_inf_data, out_data);
CheckFiniteAndUnscale<T><<<grid, block, 0, dev_ctx.stream()>>>(
x_data, inverse_scale_v, num, found_inf_data, out_data);
VLOG(3) << "finish kernel";
}
}
......@@ -68,9 +77,6 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T>
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
amp_check_finite_and_scale,
ops::AmpCheckFiniteAndScaleKernel<paddle::platform::CUDADeviceContext,
float>,
ops::AmpCheckFiniteAndScaleKernel<paddle::platform::CUDADeviceContext,
double>);
REGISTER_OP_CUDA_KERNEL(check_finite_and_unscale,
ops::CheckFiniteAndUnscaleGpuKernel<float>,
ops::CheckFiniteAndUnscaleGpuKernel<double>);
......@@ -16,51 +16,16 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/isfinite_op.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class AmpCheckFiniteAndScaleKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto& dev_ctx = ctx.template device_context<DeviceContext>();
const auto xs = ctx.MultiInput<framework::Tensor>("X");
const auto* scale = ctx.Input<framework::Tensor>("Scale");
auto outs = ctx.MultiOutput<framework::Tensor>("Out");
auto* found_inf = ctx.Output<framework::Tensor>("FoundInfinite");
const T* scale_data = scale->data<T>();
bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace());
*found_inf_data = false;
framework::Tensor is_finite =
ctx.AllocateTmpTensor<bool, DeviceContext>({1}, dev_ctx);
bool* is_finite_data = is_finite.template data<bool>();
auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
for (size_t i = 0; i < xs.size(); ++i) {
const auto* x = xs[i];
auto* out = outs[i];
out->mutable_data<T>(dev_ctx.GetPlace());
if (!(*found_inf_data)) {
framework::TensorIsfinite(*x, &is_finite);
if (*is_finite_data) {
auto eigen_out = framework::EigenVector<T>::Flatten(*out);
auto eigen_in = framework::EigenVector<T>::Flatten(*x);
eigen_out.device(dev) = (*scale_data) * eigen_in;
} else {
*found_inf_data = true;
break;
}
}
}
return;
}
};
template <typename T>
inline HOSTDEVICE T Inverse(T s) {
return 1.0 / s;
}
} // namespace operators
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/amp/update_loss_scaling_op.h"
#include <cstring>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
class UpdateLossScalingOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "update_loss_scaling");
OP_INOUT_CHECK(ctx->HasInput("FoundInfinite"), "Input", "FoundInfinite",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasInput("PrevLossScaling"), "Input", "PrevLossScaling",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasInput("InGoodSteps"), "Input", "InGoodSteps",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasInput("InBadSteps"), "Input", "InBadSteps",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasOutput("LossScaling"), "Output", "LossScaling",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasOutput("OutGoodSteps"), "Output", "OutGoodSteps",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasOutput("OutBadSteps"), "Output", "OutBadSteps",
"update_loss_scaling");
auto x_dims = ctx->GetInputsDim("X");
ctx->SetOutputsDim("Out", x_dims);
ctx->SetOutputDim("LossScaling", {1});
ctx->SetOutputDim("OutGoodSteps", {1});
ctx->SetOutputDim("OutBadSteps", {1});
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "PrevLossScaling"),
ctx.device_context());
}
};
class UpdateLossScalingOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensors) The input tensors of update_loss_scaling operator.")
.AsDuplicable();
AddInput("FoundInfinite",
"(Tensor) 1-dim tensor, contains a bool scalar, which indicates "
"whether there is any infinite gradient.");
AddInput("PrevLossScaling",
"(Tensor) 1-dim tensor, previous loss scaling.");
AddInput("InGoodSteps",
"(Tensor) 1-dim tensor, accumulates good steps in which all "
"gradients are finite.");
AddInput("InBadSteps",
"(Tensor) 1-dim tensor, accumulates bad steps in which some "
"gradients are infinite.");
AddOutput("Out",
"(Tensors) The output tensor of update_loss_scaling operator.")
.AsDuplicable();
AddOutput("LossScaling", "(Tensor) 1-dim tensor, updated loss scaling.");
AddOutput("OutGoodSteps", "(Tensor) 1-dim tensor, pdated good steps.");
AddOutput("OutBadSteps", "(Tensor) 1-dim tensor, updated bad steps.");
AddAttr<int>("incr_every_n_steps",
"A value represents increasing loss scaling every n "
"consecutive steps with finite gradients.");
AddAttr<int>("decr_every_n_nan_or_inf",
"A value represents decreasing loss scaling every n "
"accumulated steps with nan or inf gradients.");
AddAttr<float>("incr_ratio",
"The multiplier to use when increasing the loss scaling.")
.AddCustomChecker([](float incr_ratio) {
PADDLE_ENFORCE_EQ(incr_ratio > 1.0f, true,
platform::errors::InvalidArgument(
"'incr_ratio' should be greater than 1, but "
"the received is %f",
incr_ratio));
});
AddAttr<float>(
"decr_ratio",
"The less-than-one-multiplier to use when decreasing loss scaling.")
.AddCustomChecker([](float decr_ratio) {
PADDLE_ENFORCE_EQ(decr_ratio > 0.0f && decr_ratio < 1.0f, true,
platform::errors::InvalidArgument(
"'incr_ratio' should be between 0 and 1, but "
"the received is %f",
decr_ratio));
});
AddComment(R"DOC(
Update loss scaling according to overall gradients. If all gradients is
finite after incr_every_n_steps, loss scaling will increase by incr_ratio.
Otherwise, loss scaling will decrease by decr_ratio after
decr_every_n_nan_or_inf steps and each step some gradients are infinite.
)DOC");
}
};
template <typename T>
class UpdateLossScalingFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& ctx,
const bool* found_inf_data, const T* pre_loss_scaling_data,
const int* good_in_data, const int* bad_in_data,
const int incr_every_n_steps,
const int decr_every_n_nan_or_inf, const float incr_ratio,
const float decr_ratio, T* updated_loss_scaling_data,
int* good_out_data, int* bad_out_data) const {
Update<T>(found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data,
incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio,
decr_ratio, updated_loss_scaling_data, good_out_data,
bad_out_data);
}
};
template <typename T>
class LazyZeroInputs<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& dev_ctx,
const bool* found_inf_data,
const std::vector<const framework::Tensor*>& xs,
const std::vector<framework::Tensor*>& outs) const {
if (*found_inf_data) {
VLOG(1) << "-- UpdateLossScaling: Infinite values are found in grads. --";
for (size_t i = 0; i < xs.size(); ++i) {
auto* out = outs[i];
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = out->numel();
std::memset(out_data, 0, num * sizeof(T));
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(
update_loss_scaling, ops::UpdateLossScalingOp,
ops::UpdateLossScalingOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(update_loss_scaling,
ops::UpdateLossScalingKernel<CPU, float>,
ops::UpdateLossScalingKernel<CPU, double>);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/amp/update_loss_scaling_op.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void GpuUpdateLossScaling(
const bool* found_inf_data, const T* pre_loss_scaling_data,
const int* good_in_data, const int* bad_in_data,
const int incr_every_n_steps, const int decr_every_n_nan_or_inf,
const float incr_ratio, const float decr_ratio,
T* updated_loss_scaling_data, int* good_out_data, int* bad_out_data) {
Update<T>(found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data,
incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio,
updated_loss_scaling_data, good_out_data, bad_out_data);
}
template <typename T>
class UpdateLossScalingFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& dev_ctx,
const bool* found_inf_data, const T* pre_loss_scaling_data,
const int* good_in_data, const int* bad_in_data,
const int incr_every_n_steps,
const int decr_every_n_nan_or_inf, const float incr_ratio,
const float decr_ratio, T* updated_loss_scaling_data,
int* good_out_data, int* bad_out_data) const {
GpuUpdateLossScaling<T><<<1, 1, 0, dev_ctx.stream()>>>(
found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data,
incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio,
updated_loss_scaling_data, good_out_data, bad_out_data);
}
};
template <typename T>
class LazyZeroInputs<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& dev_ctx,
const bool* found_inf_data,
const std::vector<const framework::Tensor*>& xs,
const std::vector<framework::Tensor*>& outs) const {
const auto gpu_place =
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace());
bool has_inf{false};
memory::Copy(platform::CPUPlace(), &has_inf, gpu_place, found_inf_data,
sizeof(bool), dev_ctx.stream());
if (has_inf) {
VLOG(1) << "-- UpdateLossScaling: Infinite values are found in grads. --";
for (size_t i = 0; i < xs.size(); ++i) {
auto* out = outs[i];
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = out->numel();
cudaMemset(out_data, 0, num * sizeof(T));
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using GPU = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(update_loss_scaling,
ops::UpdateLossScalingKernel<GPU, float>,
ops::UpdateLossScalingKernel<GPU, double>);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cmath>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/errors.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
HOSTDEVICE void Update(const bool* found_inf_data,
const T* pre_loss_scaling_data, const int* good_in_data,
const int* bad_in_data, const int incr_every_n_steps,
const int decr_every_n_nan_or_inf,
const float incr_ratio, const float decr_ratio,
T* updated_loss_scaling_data, int* good_out_data,
int* bad_out_data) {
if (*found_inf_data) {
*good_out_data = 0;
*bad_out_data = *bad_in_data + 1;
if (*bad_out_data == decr_every_n_nan_or_inf) {
T new_loss_scaling = *pre_loss_scaling_data * decr_ratio;
*updated_loss_scaling_data = new_loss_scaling < static_cast<T>(1)
? static_cast<T>(1)
: new_loss_scaling;
*bad_out_data = 0;
}
} else {
*bad_out_data = 0;
*good_out_data = *good_in_data + 1;
if (*good_out_data == incr_every_n_steps) {
T new_loss_scaling = *pre_loss_scaling_data * incr_ratio;
*updated_loss_scaling_data = std::isfinite(new_loss_scaling)
? new_loss_scaling
: *pre_loss_scaling_data;
*good_out_data = 0;
}
}
}
template <typename DeviceContext, typename T>
class UpdateLossScalingFunctor {
public:
void operator()(const DeviceContext& dev_ctx, const bool* found_inf_data,
const T* pre_loss_scaling_data, const int* good_in_data,
const int* bad_in_data, const int incr_every_n_steps,
const int decr_every_n_nan_or_inf, const float incr_ratio,
const float decr_ratio, T* updated_loss_scaling_data,
int* good_out_data, int* bad_out_data) const;
};
template <typename DeviceContext, typename T>
class LazyZeroInputs {
public:
void operator()(const DeviceContext& dev_ctx, const bool* found_inf_data,
const std::vector<const framework::Tensor*>& xs,
const std::vector<framework::Tensor*>& outs) const;
};
template <typename DeviceContext, typename T>
class UpdateLossScalingKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto xs = ctx.MultiInput<framework::Tensor>("X");
const auto* found_inf = ctx.Input<Tensor>("FoundInfinite");
const auto* pre_loss_scaling = ctx.Input<Tensor>("PrevLossScaling");
const auto* good_in = ctx.Input<Tensor>("InGoodSteps");
const auto* bad_in = ctx.Input<Tensor>("InBadSteps");
auto outs = ctx.MultiOutput<framework::Tensor>("Out");
auto* updated_loss_scaling = ctx.Output<Tensor>("LossScaling");
auto* good_out = ctx.Output<Tensor>("OutGoodSteps");
auto* bad_out = ctx.Output<Tensor>("OutBadSteps");
PADDLE_ENFORCE_EQ(found_inf->numel(), 1,
platform::errors::InvalidArgument(
"FoundInfinite must has only one element."));
const bool* found_inf_data = found_inf->data<bool>();
const T* pre_loss_scaling_data = pre_loss_scaling->data<T>();
const int* good_in_data = good_in->data<int>();
const int* bad_in_data = bad_in->data<int>();
auto& dev_ctx = ctx.template device_context<DeviceContext>();
T* updated_loss_scaling_data =
updated_loss_scaling->mutable_data<T>(dev_ctx.GetPlace());
int* good_out_data = good_out->mutable_data<int>(dev_ctx.GetPlace());
int* bad_out_data = bad_out->mutable_data<int>(dev_ctx.GetPlace());
const int incr_every_n_steps = ctx.Attr<int>("incr_every_n_steps");
const int decr_every_n_nan_or_inf =
ctx.Attr<int>("decr_every_n_nan_or_inf");
const float incr_ratio = ctx.Attr<float>("incr_ratio");
const float decr_ratio = ctx.Attr<float>("decr_ratio");
UpdateLossScalingFunctor<DeviceContext, T>{}(
dev_ctx, found_inf_data, pre_loss_scaling_data, good_in_data,
bad_in_data, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio,
decr_ratio, updated_loss_scaling_data, good_out_data, bad_out_data);
LazyZeroInputs<DeviceContext, T>{}(dev_ctx, found_inf_data, xs, outs);
}
};
} // namespace operators
} // namespace paddle
......@@ -111,8 +111,16 @@ class CompareOp : public framework::OperatorWithKernel {
framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
// CompareOp kernel's device type is decided by input tensor place
bool force_cpu = ctx.Attr<bool>("force_cpu");
kt.place_ = force_cpu ? platform::CPUPlace()
: ctx.Input<framework::LoDTensor>("X")->place();
if (force_cpu) {
kt.place_ = platform::CPUPlace();
} else {
if (ctx.Input<framework::LoDTensor>("X")->place().type() !=
typeid(platform::CUDAPinnedPlace)) {
kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
} else {
kt.place_ = ctx.GetPlace();
}
}
return kt;
}
};
......
......@@ -41,9 +41,13 @@ detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_fo
detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc)
if(WITH_GPU)
detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub)
detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS memory cub)
detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc collect_fpn_proposals_op.cu DEPS memory cub)
set(TMPDEPS memory)
if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
set(TMPDEPS memory cub)
endif()
detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS ${TMPDEPS})
detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS ${TMPDEPS})
detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc collect_fpn_proposals_op.cu DEPS ${TMPDEPS})
else()
detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc)
......
......@@ -176,15 +176,27 @@ static void DistGradFunction(const framework::ExecutionContext& context) {
} else if (p == INFINITY || p == -INFINITY) {
// p=inf or -inf, Lp-norm = |z_i|, the j-th element of dz tends to 0 if
// j!=i, or equals to sign(z_i) * dout if j=i.
grad_t.device(place) =
(x_minux_y_abs == out_t.broadcast(out_bcast_dims)).template cast<T>() *
if (platform::is_cpu_place(context.GetPlace())) {
grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims))
.template cast<T>() *
sign.eval() * out_grad_t.broadcast(out_bcast_dims);
} else {
grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims))
.template cast<T>() *
sign * out_grad_t.broadcast(out_bcast_dims);
}
} else {
// dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout
if (platform::is_cpu_place(context.GetPlace())) {
grad_t.device(place) =
(x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) *
sign.eval() * out_grad_t.broadcast(out_bcast_dims);
} else {
grad_t.device(place) =
(x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * sign *
out_grad_t.broadcast(out_bcast_dims);
}
}
Eigen::DSizes<int, Rank * 2> x_reshape_dims;
Eigen::DSizes<int, Rank * 2> y_reshape_dims;
......
......@@ -49,8 +49,6 @@ REGISTER_OP_WITHOUT_GRADIENT(elementwise_floordiv, ops::ElementwiseOp,
REGISTER_OP_CPU_KERNEL(
elementwise_floordiv,
ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, float>,
ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, double>,
ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, int>,
ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext,
int64_t>);
......@@ -19,7 +19,5 @@ namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
elementwise_floordiv,
ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, float>,
ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, double>,
ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int>,
ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int64_t>);
......@@ -14,7 +14,6 @@ limitations under the License. */
#pragma once
#include <math.h>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
......@@ -62,15 +61,8 @@ void elementwise_floor_div(const framework::ExecutionContext &ctx,
const framework::Tensor *x,
const framework::Tensor *y, framework::Tensor *z) {
int axis = ctx.Attr<int>("axis");
auto x_dims = x->dims();
auto y_dims = y->dims();
if (x_dims.size() >= y_dims.size()) {
ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>(
ctx, x, y, axis, FloorDivFunctor<T>(), z);
} else {
ElementwiseComputeEx<InverseFloorDivFunctor<T>, DeviceContext, T>(
ctx, x, y, axis, InverseFloorDivFunctor<T>(), z);
}
}
template <typename DeviceContext, typename T>
......
......@@ -33,22 +33,7 @@ class ElementwiseMulOp : public ElementwiseOp {
auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
#ifdef PADDLE_WITH_MKLDNN
using mkldnn::memory;
auto CanMKLDNNElementwiseMulBeUsed = [&]() {
auto x_dims = ctx.Input<Tensor>("X")->dims();
auto y_dims = ctx.Input<Tensor>("Y")->dims();
int rankdiff = x_dims.size() - y_dims.size();
// TODO(jczaja): Remove this when oneDNN performance for scalar
// broadcasting
// is improved (Ernie large situation)
if (rankdiff != 0 && y_dims.size() == 1 && y_dims[0] == 1) {
return false;
}
return true;
};
if (platform::CanMKLDNNBeUsed(ctx) && CanMKLDNNElementwiseMulBeUsed()) {
if (platform::CanMKLDNNBeUsed(ctx)) {
return framework::OpKernelType(input_data_type, ctx.GetPlace(),
framework::DataLayout::kMKLDNN,
framework::LibraryType::kMKLDNN);
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/empty_op.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
class EmptyOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("ShapeTensor",
"(Tensor<int>), optional). The shape of the output."
"It has a higher priority than Attr(shape).")
.AsDispensable();
AddInput("ShapeTensorList",
"(vector<Tensor<int>>, optional). The shape of the output. "
"It has a higher priority than Attr(shape)."
"The shape of the element in vector must be [1].")
.AsDuplicable()
.AsDispensable();
AddAttr<std::vector<int64_t>>("shape",
"(vector<int64_t>) The shape of the output")
.SetDefault({});
AddAttr<int>("dtype", "The data type of output tensor, Default is float")
.SetDefault(framework::proto::VarType::FP32);
AddOutput("Out", "(Tensor) The output tensor.");
AddComment(R"DOC(empty operator
Returns a tensor filled with uninitialized data. The shape of the tensor is
defined by the variable argument shape.
The type of the tensor is specify by `dtype`.
)DOC");
}
};
class EmptyOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* context) const override {
OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "empty");
if (context->HasInput("ShapeTensor")) {
auto dims = context->GetInputDim("ShapeTensor");
int num_ele = 1;
for (int i = 0; i < dims.size(); ++i) {
num_ele *= dims[i];
}
context->SetOutputDim("Out", framework::make_ddim({num_ele}));
} else if (context->HasInputs("ShapeTensorList")) {
std::vector<int> out_dims;
auto dims_list = context->GetInputsDim("ShapeTensorList");
for (size_t i = 0; i < dims_list.size(); ++i) {
auto& dims = dims_list[i];
PADDLE_ENFORCE_EQ(
dims, framework::make_ddim({1}),
"ShapeError: The shape of Tensor in list must be [1]. "
"But received the shape "
"is [%s]",
dims);
out_dims.push_back(dims[0]);
}
context->SetOutputDim("Out", framework::make_ddim(out_dims));
} else {
auto& shape = context->Attrs().Get<std::vector<int64_t>>("shape");
context->SetOutputDim("Out", framework::make_ddim(shape));
}
}
protected:
framework::OpKernelType GetKernelTypeForVar(
const std::string& var_name, const framework::Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const override {
if (var_name == "ShapeTensor" || var_name == "ShapeTensorList") {
return expected_kernel_type;
} else {
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(), tensor.layout());
}
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& context) const override {
return framework::OpKernelType(
framework::proto::VarType::Type(context.Attr<int>("dtype")),
context.GetPlace());
}
};
class EmptyOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(framework::InferVarTypeContext* context) const override {
auto data_type = static_cast<framework::proto::VarType::Type>(
BOOST_GET_CONST(int, context->GetAttr("dtype")));
context->SetOutputDataType("Out", data_type);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OPERATOR(
empty, ops::EmptyOp, ops::EmptyOpMaker, ops::EmptyOpVarTypeInference,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(empty, ops::EmptyKernel<plat::CPUDeviceContext, bool>,
ops::EmptyKernel<plat::CPUDeviceContext, int>,
ops::EmptyKernel<plat::CPUDeviceContext, int64_t>,
ops::EmptyKernel<plat::CPUDeviceContext, float>,
ops::EmptyKernel<plat::CPUDeviceContext, double>,
ops::EmptyKernel<plat::CPUDeviceContext, plat::float16>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/empty_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
empty, ops::EmptyKernel<plat::CUDADeviceContext, bool>,
ops::EmptyKernel<plat::CUDADeviceContext, int>,
ops::EmptyKernel<plat::CUDADeviceContext, int64_t>,
ops::EmptyKernel<plat::CUDADeviceContext, float>,
ops::EmptyKernel<plat::CUDADeviceContext, double>,
ops::EmptyKernel<plat::CUDADeviceContext, plat::float16>);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/utils.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class EmptyKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto dtype = static_cast<framework::proto::VarType::Type>(
context.Attr<int>("dtype"));
Tensor *out_tensor = context.Output<Tensor>("Out");
auto shape = GetShape(context);
out_tensor->Resize(shape);
out_tensor->mutable_data(context.GetPlace(), dtype);
}
};
} // namespace operators
} // namespace paddle
......@@ -228,6 +228,26 @@ class ExpandGradOpMaker : public framework::SingleGradOpMaker<T> {
}
};
template <typename T>
class ExpandDoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetInput("X", this->OutputGrad(framework::GradVarName("X")));
op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out")));
if (this->HasInput("expand_times_tensor")) {
op->SetInput("expand_times_tensor", this->Input("expand_times_tensor"));
}
if (this->HasInput("ExpandTimes")) {
op->SetInput("ExpandTimes", this->Input("ExpandTimes"));
}
op->SetAttrMap(this->Attrs());
op->SetType("expand");
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandGradNoNeedBufVarsInferer, "X");
} // namespace operators
......@@ -238,6 +258,8 @@ REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker,
ops::ExpandGradOpMaker<paddle::framework::OpDesc>,
ops::ExpandGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp,
ops::ExpandDoubleGradOpMaker<paddle::framework::OpDesc>,
ops::ExpandDoubleGradOpMaker<paddle::imperative::OpBase>,
ops::ExpandGradNoNeedBufVarsInferer);
REGISTER_OP_CPU_KERNEL(
expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -230,6 +230,26 @@ class ExpandV2GradOpMaker : public framework::SingleGradOpMaker<T> {
}
};
template <typename T>
class ExpandV2DoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("expand_v2");
op->SetInput("X", this->OutputGrad(framework::GradVarName("X")));
op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out")));
if (this->HasInput("expand_shapes_tensor")) {
op->SetInput("expand_shapes_tensor", this->Input("expand_shapes_tensor"));
}
if (this->HasInput("Shape")) {
op->SetInput("Shape", this->Input("Shape"));
}
op->SetAttrMap(this->Attrs());
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandV2GradNoNeedBufVarsInferer, "X");
} // namespace operators
......@@ -240,6 +260,8 @@ REGISTER_OPERATOR(expand_v2, ops::ExpandV2Op, ops::ExpandV2OpMaker,
ops::ExpandV2GradOpMaker<paddle::framework::OpDesc>,
ops::ExpandV2GradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(expand_v2_grad, ops::ExpandV2GradOp,
ops::ExpandV2DoubleGradOpMaker<paddle::framework::OpDesc>,
ops::ExpandV2DoubleGradOpMaker<paddle::imperative::OpBase>,
ops::ExpandV2GradNoNeedBufVarsInferer);
REGISTER_OP_CPU_KERNEL(
expand_v2, ops::ExpandV2Kernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -27,27 +27,6 @@ namespace operators {
using Tensor = framework::Tensor;
inline framework::DDim GetShape(const framework::ExecutionContext &ctx,
std::string op_type) {
// 1. shape is a Tensor
if (ctx.HasInput("ShapeTensor")) {
auto *shape_tensor = ctx.Input<framework::LoDTensor>("ShapeTensor");
auto vec_shape = GetDataFromTensor<int>(shape_tensor);
return framework::make_ddim(vec_shape);
}
// 2. shape is a list/tuple containing Tensor
auto shape_tensor_list = ctx.MultiInput<framework::Tensor>("ShapeTensorList");
if (shape_tensor_list.size() > 0) {
auto vec_shape = GetDataFromTensorList(shape_tensor_list);
return framework::make_ddim(vec_shape);
}
// 3. shape is a list/tuple without containing Tensor
auto vec_shape = ctx.Attr<std::vector<int64_t>>("shape");
return framework::make_ddim(vec_shape);
}
template <typename T>
class FillConstantKernel : public framework::OpKernel<T> {
public:
......@@ -93,8 +72,7 @@ class FillConstantKernel : public framework::OpKernel<T> {
}
value = tensor_data[0];
}
const std::string op_type = "fill_constant";
auto shape = GetShape(ctx, op_type);
auto shape = GetShape(ctx);
if (out_var->IsType<framework::LoDTensor>()) {
tensor = out_var->GetMutable<framework::LoDTensor>();
......
......@@ -367,8 +367,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
auto blas = math::GetBlas<DeviceContext, T>(ctx);
for (int64_t i = 0; i < ids_numel; ++i) {
PADDLE_ENFORCE_LT(ids_data[i], row_number);
PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i);
PADDLE_ENFORCE_LT(
ids_data[i], row_number,
platform::errors::OutOfRange(
"Value of Ids %d should less than dict size %d.", i, row_number));
PADDLE_ENFORCE_GE(ids_data[i], 0,
platform::errors::OutOfRange(
"Value of Ids %d should greater than ZERO.", i));
memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width,
row_width * sizeof(T));
}
......@@ -473,8 +478,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
for (int64_t i = 0; i < ids_numel; ++i) {
PADDLE_ENFORCE_LT(ids_data[i], row_number);
PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i);
PADDLE_ENFORCE_LT(
ids_data[i], row_number,
platform::errors::OutOfRange(
"Value of Ids %d should less than dict size %d.", i, row_number));
PADDLE_ENFORCE_GE(ids_data[i], 0,
platform::errors::OutOfRange(
"Value of Ids %d should greater than ZERO.", i));
memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width,
row_width * sizeof(T));
}
......
......@@ -30,16 +30,18 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fusion_gru");
OP_INOUT_CHECK(ctx->HasInput("WeightX"), "Input", "WeightX", "fusion_gru");
OP_INOUT_CHECK(ctx->HasInput("WeightH"), "Input", "WeightH", "fusion_gru");
OP_INOUT_CHECK(ctx->HasOutput("XX"), "Output", "XX", "fusion_gru");
OP_INOUT_CHECK(ctx->HasOutput("Hidden"), "Output", "Hidden", "fusion_gru");
auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(x_dims.size(), 2,
platform::errors::InvalidArgument(
"Input(X)'s rank must be 2, but received input dim "
"size is:%d, input dim is:[%s]",
x_dims.size(), x_dims));
auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1)
? framework::flatten_to_2d(x_dims, 1)
: x_dims;
PADDLE_ENFORCE_EQ(
x_mat_dims.size(), 2,
platform::errors::InvalidArgument("The size of input X dims should be 2, "
"or 3 with second dimension equal to "
"1, but now Input X dim is:[%s] ",
x_dims));
auto wx_dims = ctx->GetInputDim("WeightX");
PADDLE_ENFORCE_EQ(wx_dims.size(), 2,
......@@ -47,12 +49,14 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
"The rank of Input(WeightX) should be 2, but received "
"WeightX dim size is:%d, WeightX dim is:[%s] ",
wx_dims.size(), wx_dims));
PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1],
PADDLE_ENFORCE_EQ(
wx_dims[0], x_mat_dims[1],
platform::errors::InvalidArgument(
"The first dimension of Input(WeightX) "
"should equal to second dimension of input x, but "
"received WeightX dimension is:%d, x dimension is:%d",
wx_dims[0], x_dims[1]));
"The first dimension of flattened WeightX"
"should equal to last dimension of flattened input X, but "
"received fattened WeightX dimension is:%d, flattened X dimension "
"is:%d",
wx_dims[0], x_mat_dims[1]));
int frame_size = wx_dims[1] / 3;
auto wh_dims = ctx->GetInputDim("WeightH");
......@@ -102,24 +106,24 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
"received bias dim is:[%s], frame size is:%d",
b_dims, frame_size));
}
framework::DDim out_dims({x_dims[0], frame_size});
framework::DDim out_dims({x_mat_dims[0], frame_size});
ctx->SetOutputDim("Hidden", out_dims);
ctx->ShareLoD("X", "Hidden");
int xx_width;
if (ctx->Attrs().Get<bool>("use_seq")) {
xx_width = wx_dims[1];
} else {
xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
xx_width = x_mat_dims[1] > wx_dims[1] ? wx_dims[1] : x_mat_dims[1];
OP_INOUT_CHECK(ctx->HasOutput("ReorderedH0"), "Output", "ReorderedH0",
"fusion_gru");
OP_INOUT_CHECK(ctx->HasOutput("BatchedInput"), "Output", "BatchedInput",
"fusion_gru");
OP_INOUT_CHECK(ctx->HasOutput("BatchedOut"), "Output", "BatchedOut",
"fusion_gru");
ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
ctx->SetOutputDim("BatchedInput", {x_mat_dims[0], wx_dims[1]});
ctx->SetOutputDim("BatchedOut", out_dims);
}
ctx->SetOutputDim("XX", {x_dims[0], xx_width});
ctx->SetOutputDim("XX", {x_mat_dims[0], xx_width});
ctx->ShareLoD("X", "XX");
}
......@@ -202,6 +206,27 @@ void FusionGRUOpMaker::Make() {
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>(
"mkldnn_data_type",
"(string, default \"float32\"). Data type of mkldnn kernel")
.SetDefault("float32")
.InEnum({"float32", "int8", "bfloat16"});
AddAttr<float>("Scale_data",
"Scale to be used for int8 input/output data."
"Only used with MKL-DNN INT8.")
.SetDefault(1.0f);
AddAttr<float>("Shift_data",
"Shift to be used for int8 input/output data."
"Only used with MKL-DNN INT8.")
.SetDefault(0.0f);
AddAttr<std::vector<float>>("Scale_weights",
"Scale_weights to be used for int8 weights data."
"Only used with MKL-DNN INT8.")
.SetDefault({1.0f});
AddAttr<bool>("force_fp32_output",
"(bool, default false) Force INT8 kernel output FP32, only "
"used in MKL-DNN INT8")
.SetDefault(false);
AddComment(R"DOC(
The Fusion complete GRU Operator.
This operator fuse the fully-connected operator into GRU,
......@@ -226,8 +251,11 @@ class FusionGRUKernel : public framework::OpKernel<T> {
auto* xx = ctx.Output<LoDTensor>("XX"); \
auto x_lod = x->lod(); \
auto x_dims = x->dims(); /* T x M*/ \
auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1) \
? framework::flatten_to_2d(x_dims, 1) \
: x_dims; \
auto wh_dims = wh->dims(); /* D x 3D*/ \
const int total_T = x_dims[0]; \
const int total_T = x_mat_dims[0]; \
const int D3 = wh_dims[1]
#define INIT_OTHER_DEFINES \
......@@ -236,7 +264,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
auto* bias = ctx.Input<Tensor>("Bias"); \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
bool is_reverse = ctx.Attr<bool>("is_reverse"); \
const int M = x_dims[1]; \
const int M = x_mat_dims[1]; \
const int D = wh_dims[0]; \
const int D2 = D * 2; \
const jit::gru_attr_t attr( \
......
......@@ -21,11 +21,12 @@ namespace operators {
using paddle::framework::LoDTensor;
using paddle::framework::Tensor;
using paddle::platform::CPUDeviceContext;
using paddle::platform::CreateKey;
using paddle::platform::MKLDNNGetDataType;
using paddle::platform::MKLDNNMemDesc;
using platform::to_void_cast;
template <typename T>
template <typename T, typename T_out = T>
class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
public:
GRUMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
......@@ -38,7 +39,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
const std::string& unique_name)
: platform::MKLDNNHandlerT<T, dnnl::gru_forward>(
dev_ctx, dev_ctx.GetEngine(), cpu_place,
platform::CreateKey(unique_name, Ti)),
CreateKey(unique_name, MKLDNNGetDataType<T>(), Ti)),
N(N),
Ti(Ti),
IC(IC),
......@@ -47,9 +48,29 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
// do not depend on Ti size but primitive and input/output memory do
if (platform::MKLDNNDeviceContext::tls().get_cur_mkldnn_session_id() !=
platform::MKLDNNDeviceContextThreadLocals::kMKLDNNSessionID_Default) {
memory_key_ = unique_name;
memory_key_ = CreateKey(unique_name, MKLDNNGetDataType<T>());
} else {
memory_key_ = unique_name + "-t:" + platform::ThreadIDasStr();
memory_key_ = CreateKey(unique_name, MKLDNNGetDataType<T>(), "-t:",
platform::ThreadIDasStr());
}
// Is it int8 kernel
const bool is_INT8 = std::is_same<T, uint8_t>::value;
if (is_INT8) {
// Int8 attributes
const float scale_data = ctx.Attr<float>("Scale_data");
const float shift_data = ctx.Attr<float>("Shift_data");
const auto scale_weights = ctx.Attr<std::vector<float>>("Scale_weights");
const int weights_scale_mask =
0 +
(1 << 3) // bit, indicating the unique scales for `g` dim in `ldigo`
+
(1 << 4); // bit, indicating the unique scales for `o` dim in `ldigo`
attr_.set_rnn_data_qparams(scale_data, shift_data);
attr_.set_rnn_weights_qparams(weights_scale_mask, scale_weights);
}
if (!this->isCached()) {
......@@ -63,6 +84,10 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
platform::errors::Unimplemented(
"oneDNN fusion_gru supports only tanh as an activation."));
// Weights for int8 kernel are of a type s8
const auto weights_dt =
is_INT8 ? dnnl::memory::data_type::s8 : dnnl::memory::data_type::f32;
// oneDNN RNN dimensions
const int64_t D = 1; // Directions
const int64_t L = 1; // Layers (PP supports only 1 stacked layer)
......@@ -71,19 +96,16 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
// Create memory descriptors
auto input_md = MKLDNNMemDesc({Ti, N, IC}, MKLDNNGetDataType<T>(),
MKLDNNMemoryFormat::any);
auto weight_x_md = MKLDNNMemDesc(
{L, D, IC, G, OC}, MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::any);
auto weight_h_md = MKLDNNMemDesc(
{L, D, OC, G, OC}, MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::any);
auto weight_x_md =
MKLDNNMemDesc({L, D, IC, G, OC}, weights_dt, MKLDNNMemoryFormat::any);
auto weight_h_md =
MKLDNNMemDesc({L, D, OC, G, OC}, weights_dt, MKLDNNMemoryFormat::any);
auto bias_md = MKLDNNMemDesc({L, D, G, OC}, MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldgo);
auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType<T>(),
auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType<T_out>(),
MKLDNNMemoryFormat::any);
auto h0_md = dnnl::memory::desc();
if (h0) {
h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<T>(),
auto h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<T>(),
MKLDNNMemoryFormat::ldnc);
}
// Create GRU oneDNN primitive
const auto direction =
......@@ -91,7 +113,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
: dnnl::rnn_direction::unidirectional_left2right;
this->AcquireForwardPrimitiveDescriptor(
dnnl::prop_kind::forward_inference, direction, input_md, h0_md,
attr_, dnnl::prop_kind::forward_inference, direction, input_md, h0_md,
weight_x_md, weight_h_md, bias_md, hidden_md, dnnl::memory::desc());
}
}
......@@ -101,29 +123,31 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
dnnl::memory::format_tag::ntc);
}
void reorderRNNdata(const T* input_data, T* output_data,
void reorderRNNdata(void* input_data, void* output_data,
std::vector<size_t> lod, const bool is_reverse,
platform::RNNReorderType reorder_type) {
switch (reorder_type) {
// Reorder input memory [WORDS, C] + LoD -> [N, T, C]
case platform::RNNReorderType::PP_NTC: {
auto* input_data_iter = input_data;
auto* input_data_iter = reinterpret_cast<T*>(input_data);
auto* output_data_iter = reinterpret_cast<T*>(output_data);
for (int n = 0; n < N; ++n) {
const auto num_elements = (lod[n + 1] - lod[n]) * IC;
const auto offset = is_reverse ? (Ti * IC - num_elements) : 0;
memcpy(output_data + n * Ti * IC + offset, input_data_iter,
memcpy(output_data_iter + n * Ti * IC + offset, input_data_iter,
sizeof(T) * num_elements);
input_data_iter += num_elements;
}
} break;
// Reorder input memory [WORDS, C] + LoD -> [T, N, C]
case platform::RNNReorderType::PP_TNC: {
auto* input_data_iter = input_data;
auto* input_data_iter = reinterpret_cast<T*>(input_data);
auto* output_data_iter = reinterpret_cast<T*>(output_data);
for (int n = 0; n < N; ++n) {
const auto num_elements = (lod[n + 1] - lod[n]);
const auto offset = is_reverse ? (Ti - num_elements) : 0;
for (size_t t = 0; t < num_elements; ++t) {
memcpy(output_data + (t + offset) * N * IC + n * IC,
memcpy(output_data_iter + (t + offset) * N * IC + n * IC,
input_data_iter, sizeof(T) * IC);
input_data_iter += IC;
}
......@@ -131,24 +155,27 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
} break;
// Reorder output values to PP format [N, T, C] -> [WORDS, C]
case platform::RNNReorderType::NTC_PP: {
auto* output_data_iter = output_data;
auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
for (int n = 0; n < N; ++n) {
const auto num_elements = (lod[n + 1] - lod[n]) * OC;
const auto offset = is_reverse ? (Ti * OC - num_elements) : 0;
memcpy(output_data_iter, input_data + n * Ti * OC + offset,
sizeof(T) * num_elements);
memcpy(output_data_iter, input_data_iter + n * Ti * OC + offset,
sizeof(T_out) * num_elements);
output_data_iter += num_elements;
}
} break;
// Reorder output values to PP format [T, N, C] -> [WORDS, C]
case platform::RNNReorderType::TNC_PP: {
auto* output_data_iter = output_data;
auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
for (int n = 0; n < N; ++n) {
const auto num_elements = lod[n + 1] - lod[n];
const auto offset = is_reverse ? (Ti - num_elements) : 0;
for (size_t t = 0; t < num_elements; ++t) {
memcpy(output_data_iter,
input_data + (t + offset) * N * OC + n * OC, sizeof(T) * OC);
input_data_iter + (t + offset) * N * OC + n * OC,
sizeof(T_out) * OC);
output_data_iter += OC;
}
}
......@@ -169,9 +196,9 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
}
const auto& input_lod = input->lod()[0];
auto* x_data = input->data<T>();
auto* x_data = to_void_cast(input->data<T>());
auto* x_onednn_data = reinterpret_cast<T*>(memory_p->get_data_handle());
auto* x_onednn_data = memory_p->get_data_handle();
memset(x_onednn_data, 0, sizeof(T) * N * Ti * IC);
if (platform::GetMKLDNNFormat(this->fwd_pd_->src_desc()) ==
......@@ -198,19 +225,35 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
return memory_p;
}
// TODO(grygielski) H0 is for now persistable
std::shared_ptr<dnnl::memory> AcquireH0Memory(const Tensor* h0) {
const std::string h0_key = memory_key_ + "@h0";
auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(h0_key));
auto* h0_data = to_void_cast(h0->data<T>());
if (!memory_p) {
memory_p = std::make_shared<dnnl::memory>(
this->fwd_pd_->weights_layer_desc(), this->engine_, h0_data);
this->dev_ctx_.SetBlob(h0_key, memory_p);
auto user_h0_memory = dnnl::memory();
if (h0) {
user_h0_memory =
dnnl::memory({{1, 1, N, OC},
MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldnc},
this->engine_, to_void_cast(h0->data<float>()));
} else {
memory_p->set_data_handle(h0_data);
user_h0_memory = dnnl::memory({{1, 1, N, OC},
MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldnc},
this->engine_);
memset(user_h0_memory.get_data_handle(), 0, sizeof(float) * N * OC);
}
memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_iter_desc(),
this->engine_);
dnnl::stream astream(this->engine_);
dnnl::reorder(user_h0_memory, *memory_p, attr_)
.execute(astream, user_h0_memory, *memory_p);
this->dev_ctx_.SetBlob(h0_key, memory_p);
}
return memory_p;
}
......@@ -245,7 +288,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
this->fwd_pd_->weights_layer_desc(), this->engine_);
dnnl::stream astream(this->engine_);
dnnl::reorder(user_memory, *memory_p)
dnnl::reorder(user_memory, *memory_p, attr_)
.execute(astream, user_memory, *memory_p);
this->dev_ctx_.SetBlob(wx_key, memory_p);
......@@ -298,7 +341,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
this->fwd_pd_->weights_iter_desc(), this->engine_);
dnnl::stream astream(this->engine_);
dnnl::reorder(user_memory, *memory_p)
dnnl::reorder(user_memory, *memory_p, attr_)
.execute(astream, user_memory, *memory_p);
this->dev_ctx_.SetBlob(wh_key, memory_p);
......@@ -347,12 +390,26 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
// Memory size of weights, bias and h0 does not depend
// on Ti size, thus we need another key to cache them
std::string memory_key_;
dnnl::primitive_attr attr_;
};
template <typename T>
class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const bool is_INT8 = std::is_same<T, uint8_t>::value;
const bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
// TODO(grygielski) Add option for bfloat
if (!is_INT8 || force_fp32_output) {
RunKernel<float>(ctx);
} else {
RunKernel<uint8_t>(ctx);
}
}
template <typename Tout = T>
void RunKernel(const framework::ExecutionContext& ctx) const {
auto& dev_ctx =
ctx.template device_context<platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
......@@ -364,13 +421,16 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
const auto* weight_h = ctx.Input<Tensor>("WeightH");
const auto* bias = ctx.Input<Tensor>("Bias");
auto* hidden = ctx.Output<LoDTensor>("Hidden");
auto x_dims = input->dims();
auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1)
? framework::flatten_to_2d(x_dims, 1)
: x_dims;
// Get attributes
const bool is_reverse = ctx.Attr<bool>("is_reverse");
const bool origin_mode = ctx.Attr<bool>("origin_mode");
// Get tensor dimensions
const auto x_dims = framework::vectorize(input->dims());
const auto x_mat_dims_vec = framework::vectorize(x_mat_dims);
const auto weight_h_dims = framework::vectorize(weight_h->dims());
const auto& input_lod = input->lod()[0];
......@@ -384,15 +444,17 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
}
return res;
}();
const int64_t IC = x_dims[1]; // Input channels
const int64_t IC = x_mat_dims_vec[1]; // Input channels
const int64_t OC = weight_h_dims[0]; // Output channels
GRUMKLDNNHandler<T> handler(ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(),
input, weight_h, h0, is_reverse, N, Ti, IC, OC,
GRUMKLDNNHandler<T, Tout> handler(
ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, weight_h, h0,
is_reverse, N, Ti, IC, OC,
ctx.InputName("X") + ctx.InputName("WeightH"));
auto input_memory_p =
handler.AcquireInputMemoryWithReorder(input, is_reverse);
auto h0_memory_p = handler.AcquireH0Memory(h0);
auto weight_x_memory_p =
handler.AcquireWeightXMemory(weight_x, origin_mode);
auto weight_h_memory_p =
......@@ -402,25 +464,21 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
std::unordered_map<int, dnnl::memory> gru_args = {
{DNNL_ARG_SRC_LAYER, *input_memory_p},
{DNNL_ARG_SRC_ITER, *h0_memory_p},
{DNNL_ARG_WEIGHTS_LAYER, *weight_x_memory_p},
{DNNL_ARG_WEIGHTS_ITER, *weight_h_memory_p},
{DNNL_ARG_BIAS, *bias_memory_p},
{DNNL_ARG_DST_LAYER, *hidden_onednn_memory_p}};
if (h0) {
auto h0_memory_p = handler.AcquireH0Memory(h0);
gru_args.insert({DNNL_ARG_SRC_ITER, *h0_memory_p});
}
auto gru_forward_p = handler.AcquireForwardPrimitive();
dnnl::stream astream(mkldnn_engine);
gru_forward_p->execute(astream, gru_args);
astream.wait();
auto* hidden_onednn_data =
reinterpret_cast<T*>(hidden_onednn_memory_p->get_data_handle());
auto* hidden_data = hidden->mutable_data<T>(ctx.GetPlace());
auto* hidden_onednn_data = hidden_onednn_memory_p->get_data_handle();
auto* hidden_data =
to_void_cast(hidden->mutable_data<Tout>(ctx.GetPlace()));
if (handler.is_NTC()) {
handler.reorderRNNdata(hidden_onednn_data, hidden_data, input_lod,
is_reverse, platform::RNNReorderType::NTC_PP);
......@@ -436,4 +494,5 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(fusion_gru, MKLDNN, paddle::platform::CPUPlace,
ops::FusionGRUMKLDNNKernel<float>);
ops::FusionGRUMKLDNNKernel<float>,
ops::FusionGRUMKLDNNKernel<uint8_t>);
......@@ -34,8 +34,7 @@ class CPUGaussianRandomKernel : public framework::OpKernel<T> {
auto* tensor = context.Output<framework::Tensor>("Out");
std::normal_distribution<T> dist(mean, std);
const std::string op_type = "gaussian_random";
auto shape = GetShape(context, op_type);
auto shape = GetShape(context);
tensor->Resize(shape);
int64_t size = tensor->numel();
T* data = tensor->mutable_data<T>(context.GetPlace());
......
......@@ -58,8 +58,7 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
T mean = static_cast<T>(context.Attr<float>("mean"));
T std = static_cast<T>(context.Attr<float>("std"));
thrust::counting_iterator<unsigned int> index_sequence_begin(0);
const std::string op_type = "gaussian_random";
auto shape = GetShape(context, op_type);
auto shape = GetShape(context);
tensor->Resize(shape);
T* data = tensor->mutable_data<T>(context.GetPlace());
......
......@@ -39,7 +39,7 @@ class LiteEngineOp : public framework::OperatorBase {
private:
std::vector<std::string> in_names_;
std::vector<std::string> out_names_;
paddle::lite::Predictor *engine_;
paddle::lite_api::PaddlePredictor *engine_;
framework::proto::VarType::Type precision_;
bool use_gpu_;
bool zero_copy_;
......@@ -78,10 +78,10 @@ class LiteEngineOp : public framework::OperatorBase {
framework::LoDTensor src_t =
inference::analysis::GetFromScope<framework::LoDTensor>(scope,
in_names_[i]);
paddle::lite::Tensor *dst_t = engine_->GetInput(i);
paddle::lite_api::Tensor dst_t = *(engine_->GetInput(i));
VLOG(3) << "== fluid -> lite (" << in_names_[i] << " -> "
<< engine_->GetInputNames()[i] << ")";
inference::lite::utils::TensorCopy(dst_t, &src_t, *ctx, zero_copy_);
inference::lite::utils::TensorCopy(&dst_t, &src_t, *ctx, zero_copy_);
}
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(dev_place)) {
......@@ -93,7 +93,7 @@ class LiteEngineOp : public framework::OperatorBase {
engine_->Run();
VLOG(3) << "lite engine run done";
for (size_t i = 0; i < out_names_.size(); i++) {
paddle::lite::Tensor src_t = *(engine_->GetOutput(i));
paddle::lite_api::Tensor src_t = *(engine_->GetOutput(i));
framework::LoDTensor *dst_t =
&inference::analysis::GetFromScope<framework::LoDTensor>(
scope, out_names_[i]);
......
......@@ -84,10 +84,10 @@ TEST(LiteEngineOp, engine_op) {
inference::lite::EngineConfig config;
config.valid_places = {
#ifdef PADDLE_WITH_CUDA
paddle::lite::Place({TARGET(kCUDA), PRECISION(kFloat)}),
paddle::lite_api::Place({TARGET(kCUDA), PRECISION(kFloat)}),
#endif
paddle::lite::Place({TARGET(kHost), PRECISION(kAny)}),
paddle::lite::Place({TARGET(kX86), PRECISION(kFloat)}),
paddle::lite_api::Place({TARGET(kX86), PRECISION(kFloat)}),
paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}),
};
serialize_params(&(config.param), &scope, repetitive_params);
config.model = program.Proto()->SerializeAsString();
......
......@@ -9,7 +9,11 @@ function(math_library TARGET)
set(hip_srcs)
set(math_common_deps device_context framework_proto enforce)
if (WITH_GPU)
if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
list(APPEND math_common_deps cub)
else()
list(APPEND math_common_deps)
endif()
endif()
set(multiValueArgs DEPS)
cmake_parse_arguments(math_library "${options}" "${oneValueArgs}"
......
......@@ -128,9 +128,23 @@ struct RowwiseAdd<platform::CPUDeviceContext, T> {
const framework::Tensor& input,
const framework::Tensor& vector, framework::Tensor* output) {
auto in_dims = input.dims();
auto out_dims = output->dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector.numel(), size);
PADDLE_ENFORCE_EQ(output->dims(), in_dims);
PADDLE_ENFORCE_EQ(
vector.numel(), size,
platform::errors::InvalidArgument(
"The input vector size"
" should be equal to the size of each row of input tensor."
" Expected vector size=%d, but received %d",
size, vector.numel()));
const char* in_dims_cstr = in_dims.to_str().c_str();
const char* out_dims_cstr = out_dims.to_str().c_str();
PADDLE_ENFORCE_EQ(out_dims, in_dims,
platform::errors::InvalidArgument(
"The output tensor shape should be same as the input"
" tensor shape. Expected output tensor shape: %s,"
" but received %s",
in_dims_cstr, out_dims_cstr));
auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(vector);
......
......@@ -88,9 +88,24 @@ struct RowwiseAdd<platform::CUDADeviceContext, T> {
const framework::Tensor& input,
const framework::Tensor& vector, framework::Tensor* output) {
auto in_dims = input.dims();
auto out_dims = output->dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector.numel(), size);
PADDLE_ENFORCE_EQ(output->dims(), in_dims);
PADDLE_ENFORCE_EQ(
vector.numel(), size,
platform::errors::InvalidArgument(
"The input vector size"
" should be equal to the size of each row of input tensor."
" Expected vector size=%d, but received %d",
size, vector.numel()));
const char* in_dims_cstr = in_dims.to_str().c_str();
const char* out_dims_cstr = out_dims.to_str().c_str();
PADDLE_ENFORCE_EQ(
out_dims, in_dims,
platform::errors::InvalidArgument(
"The output tensor shape should be same as the input tensor"
" shape. Expected output tensor shape: %s,"
" but received %s",
in_dims_cstr, out_dims_cstr));
int blocks = 512;
int grids = (input.numel() + blocks - 1) / blocks;
RowwiseAddKernel<T><<<grids, blocks, 0, context.stream()>>>(
......@@ -113,7 +128,12 @@ void ColwiseSum<platform::CUDADeviceContext, double>::operator()(
framework::Tensor* vector) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector->numel(), size);
PADDLE_ENFORCE_EQ(vector->numel(), size,
platform::errors::InvalidArgument(
"The size of input vector"
" should be equal to the size of input tensor column"
" dimension. Expected vector size=%d, but received %d",
size, vector->numel()));
framework::Tensor one;
one.mutable_data<double>({in_dims[0]}, context.GetPlace());
SetConstant<platform::CUDADeviceContext, double> set;
......@@ -134,7 +154,12 @@ void RowwiseSum<platform::CUDADeviceContext, double>::operator()(
framework::Tensor* vector) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0]);
PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0],
platform::errors::InvalidArgument(
"The size of input vector"
" should be equal to the size of input tensor row"
" dimension. Expected vector size=%d, but received %d",
in_dims[0], vector->numel()));
framework::Tensor one;
one.mutable_data<double>({size}, context.GetPlace());
SetConstant<platform::CUDADeviceContext, double> set;
......
......@@ -59,7 +59,12 @@ void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
framework::Tensor* out) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(out->numel(), size);
PADDLE_ENFORCE_EQ(out->numel(), size,
platform::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor column"
" dimension. Expected output size=%d, but received %d",
size, out->numel()));
auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(*out);
......@@ -78,7 +83,13 @@ class ColwiseSum<platform::CPUDeviceContext, T> {
auto& in_dims = input.dims();
auto height = in_dims[0];
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), size);
PADDLE_ENFORCE_EQ(
out->numel(), size,
platform::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor column"
" dimension. Expected output size=%d, but received %d",
size, out->numel()));
T* out_buf = out->mutable_data<T>(out->place());
const T* in_buf = input.data<T>();
......@@ -100,8 +111,16 @@ void RowwiseMean<DeviceContext, T>::operator()(const DeviceContext& context,
const framework::Tensor& input,
framework::Tensor* out) {
auto in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]);
PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument(
"The rank of input tensor "
"should be 2, but received %d",
in_dims.size()));
PADDLE_ENFORCE_EQ(out->numel(), in_dims[0],
platform::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor row"
" dimension. Expected output size=%d, but received %d",
in_dims[0], out->numel()));
auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(*out);
......@@ -118,10 +137,19 @@ class RowwiseMean<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
auto& in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument(
"The rank of input tensor "
"should be 2, but received %d",
in_dims.size()));
auto height = in_dims[0];
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), height);
PADDLE_ENFORCE_EQ(
out->numel(), height,
platform::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor row"
" dimension. Expected output size=%d, but received %d",
height, out->numel()));
auto inv_size = 1.0 / size;
T* out_buf = out->mutable_data<T>(out->place());
const T* in_buf = input.data<T>();
......@@ -141,8 +169,16 @@ void RowwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
const framework::Tensor& input,
framework::Tensor* out) {
auto in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]);
PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument(
"The rank of input tensor "
"should be 2, but received %d",
in_dims.size()));
PADDLE_ENFORCE_EQ(out->numel(), in_dims[0],
platform::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor row"
" dimension. Expected output size=%d, but received %d",
in_dims[0], out->numel()));
auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(*out);
......@@ -159,10 +195,19 @@ class RowwiseSum<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
auto& in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument(
"The rank of input tensor "
"should be 2, but received %d",
in_dims.size()));
auto height = in_dims[0];
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), height);
PADDLE_ENFORCE_EQ(
out->numel(), height,
platform::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor row"
" dimension. Expected output size=%d, but received %d",
height, out->numel()));
T* out_buf = out->mutable_data<T>(out->place());
const T* in_buf = input.data<T>();
......
......@@ -224,7 +224,11 @@ TEST(math_funciton, set_constant) {
auto* ctx = new paddle::platform::CPUDeviceContext();
paddle::operators::math::set_constant(*ctx, &t, 10);
for (int64_t i = 0; i < t.numel(); ++i) {
PADDLE_ENFORCE_EQ(10, t.data<int>()[i]);
PADDLE_ENFORCE_EQ(10, t.data<int>()[i],
paddle::platform::errors::InvalidArgument(
"Each value of input"
"tensor should be 10, but received %d.",
t.data<int>()[i]));
}
delete ctx;
}
......
......@@ -18,7 +18,12 @@
void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size,
const std::vector<float>& data) {
PADDLE_ENFORCE_EQ(size, data.size());
PADDLE_ENFORCE_EQ(
size, data.size(),
paddle::platform::errors::InvalidArgument(
"The size of argument data should"
" be equal to the argument size. Expected %d, but received %d.",
size, data.size()));
for (size_t i = 0; i < data.size(); ++i) {
in_ptr[i] = paddle::platform::float16(data[i]);
}
......
......@@ -85,8 +85,9 @@ void PaddingFunctor(int rank, const framework::ExecutionContext& context,
PadFunction<DeviceContext, T, 6>(context, pads, src, pad_value, out);
break;
default:
PADDLE_THROW(
"PadOp only support tensors with no more than 6 dimensions.");
PADDLE_THROW(platform::errors::Unimplemented(
"PadOp only support tensors with no more"
" than 6 dimensions currently."));
}
}
......@@ -114,8 +115,9 @@ void PaddingGradFunctor(int rank, const framework::ExecutionContext& context,
PadGradFunction<DeviceContext, T, 6>(context, pads, src, out);
break;
default:
PADDLE_THROW(
"PadOp only support tensors with no more than 6 dimensions.");
PADDLE_THROW(platform::errors::Unimplemented(
"PadOp only support tensors with no more"
" than 6 dimensions currently."));
}
}
......
......@@ -19,6 +19,8 @@ limitations under the License. */
#include <random>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace math {
......@@ -31,7 +33,10 @@ namespace math {
class Sampler {
public:
explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) {
// PADDLE_ENFORCE_GT(range, 0, "Range should be greater than 0.");
PADDLE_ENFORCE_GT(range, 0, platform::errors::InvalidArgument(
"Range should be"
" greater than 0, but recevied %d.",
range));
if (seed == 0) {
std::random_device r;
seed_ = r();
......
......@@ -29,7 +29,12 @@ struct SelectedRowsAdd<platform::CPUDeviceContext, T> {
const framework::SelectedRows& input2,
framework::SelectedRows* output) {
auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2.height());
PADDLE_ENFORCE_EQ(
in1_height, input2.height(),
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, input2.height()));
output->set_height(in1_height);
auto& in1_rows = input1.rows();
......@@ -47,15 +52,31 @@ struct SelectedRowsAdd<platform::CPUDeviceContext, T> {
auto& in2_value = input2.value();
auto in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size());
PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size());
PADDLE_ENFORCE_EQ(
in1_row_numel, in2_value.numel() / in2_rows.size(),
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, in2_value.numel() / in2_rows.size()));
PADDLE_ENFORCE_EQ(
in1_row_numel, out_value->numel() / out_rows.size(),
platform::errors::InvalidArgument(
"The input and oupput width must be equal."
"But recieved input width = [%d], output width = [%d]",
in1_row_numel, out_value->numel() / out_rows.size()));
auto in1_place = input1.place();
PADDLE_ENFORCE(platform::is_cpu_place(in1_place));
PADDLE_ENFORCE_EQ(platform::is_cpu_place(in1_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the CPU place."));
auto in2_place = input2.place();
PADDLE_ENFORCE(platform::is_cpu_place(in2_place));
PADDLE_ENFORCE_EQ(platform::is_cpu_place(in2_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the CPU place."));
auto out_place = context.GetPlace();
PADDLE_ENFORCE(platform::is_cpu_place(out_place));
PADDLE_ENFORCE_EQ(platform::is_cpu_place(out_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the CPU place."));
auto* out_data = out_value->data<T>();
auto* in1_data = in1_value.data<T>();
......@@ -82,15 +103,35 @@ struct SelectedRowsAddTensor<platform::CPUDeviceContext, T> {
auto in1_height = input1.height();
auto in2_dims = input2.dims();
auto out_dims = output->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);
PADDLE_ENFORCE_EQ(
in1_height, in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, in2_dims[0]));
PADDLE_ENFORCE_EQ(
in1_height, out_dims[0],
platform::errors::InvalidArgument(
"The input and output height must be equal."
"But recieved input height = [%d], output height = [%d]",
in1_height, out_dims[0]));
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height);
PADDLE_ENFORCE_EQ(
in1_row_numel, input2.numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, input2.numel() / in1_height));
PADDLE_ENFORCE_EQ(
in1_row_numel, output->numel() / in1_height,
platform::errors::InvalidArgument(
"The input and output width must be equal."
"But recieved input width = [%d], output width = [%d]",
in1_row_numel, output->numel() / in1_height));
SetConstant<platform::CPUDeviceContext, T> functor;
functor(context, output, 0.0);
......@@ -121,7 +162,12 @@ struct SelectedRowsAddTo<platform::CPUDeviceContext, T> {
const int64_t input2_offset,
framework::SelectedRows* input2) {
auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2->height());
PADDLE_ENFORCE_EQ(
in1_height, input2->height(),
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, input2->height()));
auto& in1_rows = input1.rows();
auto& in2_rows = *(input2->mutable_rows());
......@@ -133,9 +179,13 @@ struct SelectedRowsAddTo<platform::CPUDeviceContext, T> {
in2_rows.Extend(in1_rows.begin(), in1_rows.end());
auto in1_place = input1.place();
PADDLE_ENFORCE(platform::is_cpu_place(in1_place));
PADDLE_ENFORCE_EQ(platform::is_cpu_place(in1_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the CPU place."));
auto in2_place = input2->place();
PADDLE_ENFORCE(platform::is_cpu_place(in2_place));
PADDLE_ENFORCE_EQ(platform::is_cpu_place(in2_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the CPU place."));
auto* in1_data = in1_value.data<T>();
auto* in2_data = in2_value->data<T>();
......@@ -163,7 +213,12 @@ struct SelectedRowsSumTo<platform::CPUDeviceContext, T> {
auto& in_rows = (*iter)->rows();
size += in_rows.end() - in_rows.begin();
auto in1_height = (*iter)->height();
PADDLE_ENFORCE_EQ(in1_height, input2->height());
PADDLE_ENFORCE_EQ(in1_height, input2->height(),
platform::errors::InvalidArgument(
"The two inputs height must be equal."
"But recieved first input height = [%d], second "
"input height = [%d]",
in1_height, input2->height()));
}
// concat rows
std::vector<int64_t> in2_rows;
......@@ -201,13 +256,23 @@ struct SelectedRowsAddToTensor<platform::CPUDeviceContext, T> {
}
auto in1_height = input1.height();
auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
PADDLE_ENFORCE_EQ(
in1_height, in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, in2_dims[0]));
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
PADDLE_ENFORCE_EQ(
in1_row_numel, input2->numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, input2->numel() / in1_height));
auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->data<T>();
......@@ -302,10 +367,12 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
continue;
}
PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1],
"all input should have same "
"dimension except for the first one");
platform::errors::InvalidArgument(
"All inputs should have same "
"dimension except for the first one."));
PADDLE_ENFORCE_EQ(input_height, input->height(),
"all input should have same height");
platform::errors::InvalidArgument(
"All inputs should have same height."));
row_num += input->rows().size();
merged_row_set.insert(input->rows().begin(), input->rows().end());
}
......@@ -421,10 +488,12 @@ struct MergeAverage<platform::CPUDeviceContext, T> {
continue;
}
PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1],
"all input should have same "
"dimension except for the first one");
platform::errors::InvalidArgument(
"All inputs should have same "
"dimension except for the first one."));
PADDLE_ENFORCE_EQ(input_height, input->height(),
"all input should have same height");
platform::errors::InvalidArgument(
"All input should have same height."));
row_num += input->rows().size();
merged_row_set.insert(input->rows().begin(), input->rows().end());
}
......@@ -492,13 +561,23 @@ struct UpdateToTensor<platform::CPUDeviceContext, T> {
framework::Tensor* input2) {
auto in1_height = input1.height();
auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
PADDLE_ENFORCE_EQ(
in1_height, in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, in2_dims[0]));
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
PADDLE_ENFORCE_EQ(
in1_row_numel, input2->numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, input2->numel() / in1_height));
auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->data<T>();
......
......@@ -30,7 +30,12 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
const framework::SelectedRows& input2,
framework::SelectedRows* output) {
auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2.height());
PADDLE_ENFORCE_EQ(
in1_height, input2.height(),
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, input2.height()));
output->set_height(in1_height);
framework::Vector<int64_t> in1_rows(input1.rows());
......@@ -48,18 +53,34 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
auto& in2_value = input2.value();
auto in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size());
PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size());
PADDLE_ENFORCE_EQ(
in1_row_numel, in2_value.numel() / in2_rows.size(),
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, in2_value.numel() / in2_rows.size()));
PADDLE_ENFORCE_EQ(
in1_row_numel, out_value->numel() / out_rows.size(),
platform::errors::InvalidArgument(
"The input and oupput width must be equal."
"But recieved input width = [%d], output width = [%d]",
in1_row_numel, out_value->numel() / out_rows.size()));
auto* out_data = out_value->data<T>();
auto* in1_data = in1_value.data<T>();
auto in1_place = input1.place();
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true);
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the GPU place."));
auto in2_place = input2.place();
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true);
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the GPU place."));
auto out_place = context.GetPlace();
PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_place), true);
PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the GPU place."));
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, out_place), out_data,
BOOST_GET_CONST(platform::CUDAPlace, in1_place), in1_data,
......@@ -104,15 +125,35 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
auto in1_height = input1.height();
auto in2_dims = input2.dims();
auto out_dims = output->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);
PADDLE_ENFORCE_EQ(
in1_height, in2_dims[0],
platform::errors::InvalidArgument(
"The two inputs height must be equal."
"But recieved first input height = [%d], first input height = [%d]",
in1_height, in2_dims[0]));
PADDLE_ENFORCE_EQ(
in1_height, out_dims[0],
platform::errors::InvalidArgument(
"The input and output height must be equal."
"But recieved input height = [%d], output height = [%d]",
in1_height, out_dims[0]));
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height);
PADDLE_ENFORCE_EQ(
in1_row_numel, input2.numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, input2.numel() / in1_height));
PADDLE_ENFORCE_EQ(
in1_row_numel, output->numel() / in1_height,
platform::errors::InvalidArgument(
"The input and output width must be equal."
"But recieved input width = [%d], output width = [%d]",
in1_row_numel, output->numel() / in1_height));
auto* in1_data = in1_value.data<T>();
auto* in2_data = input2.data<T>();
......@@ -148,7 +189,12 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
const int64_t input2_offset,
framework::SelectedRows* input2) {
auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2->height());
PADDLE_ENFORCE_EQ(
in1_height, input2->height(),
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, input2->height()));
auto& in1_rows = input1.rows();
auto& in2_rows = *(input2->mutable_rows());
......@@ -162,9 +208,13 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
}
auto in1_place = input1.place();
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true);
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the GPU place."));
auto in2_place = input2->place();
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true);
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the GPU place."));
auto* in1_data = in1_value.data<T>();
auto* in2_data = in2_value->data<T>();
......@@ -209,13 +259,23 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
framework::Tensor* input2) {
auto in1_height = input1.height();
auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
PADDLE_ENFORCE_EQ(
in1_height, in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, in2_dims[0]));
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
PADDLE_ENFORCE_EQ(
in1_row_numel, input2->numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, input2->numel() / in1_height));
auto* in1_data = in1_value.data<T>();
auto* in2_data = input2->data<T>();
......@@ -340,10 +400,12 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
continue;
}
PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1],
"all input should have same "
"dimension except for the first one");
platform::errors::InvalidArgument(
"All input should have same "
"dimension except for the first one."));
PADDLE_ENFORCE_EQ(input_height, input->height(),
"all input should have same height");
platform::errors::InvalidArgument(
"All input should have same height."));
merged_row_set.insert(input->rows().begin(), input->rows().end());
}
std::vector<int64_t> merge_rows_cpu(merged_row_set.begin(),
......@@ -448,13 +510,23 @@ struct UpdateToTensor<platform::CUDADeviceContext, T> {
auto in1_height = merged_in1.height();
auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
PADDLE_ENFORCE_EQ(
in1_height, in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, in2_dims[0]));
auto& in1_value = merged_in1.value();
auto& in1_rows = merged_in1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
PADDLE_ENFORCE_EQ(
in1_row_numel, input2->numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, input2->numel() / in1_height));
auto* in1_data = in1_value.template data<T>();
auto* in2_data = input2->data<T>();
......
......@@ -38,7 +38,9 @@ TEST(selected_rows_functor, gpu_add) {
{static_cast<int64_t>(rows1.size()), row_numel}),
gpu_place);
functor(ctx, in1_value, 1.0);
PADDLE_ENFORCE(cudaDeviceSynchronize());
PADDLE_ENFORCE_EQ(cudaDeviceSynchronize(), 0,
paddle::platform::errors::PreconditionNotMet(
"The all synchronization on the cuda is error!"));
std::vector<int64_t> rows2{0, 5, 7, 9};
std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
......
......@@ -34,10 +34,16 @@ class Vol2ColFunctor<platform::CPUDeviceContext, T> {
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* col,
const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol.dims().size(), 4,
"The dimension of vol should be 4.");
PADDLE_ENFORCE_EQ(col->dims().size(), 7,
"The dimension of col should be 7.");
PADDLE_ENFORCE_EQ(
vol.dims().size(), 4,
platform::errors::InvalidArgument("The dimension of"
" vol should be 4, but received %d.",
vol.dims().size()));
PADDLE_ENFORCE_EQ(
col->dims().size(), 7,
platform::errors::InvalidArgument("The dimension of"
"col should be 7, but received %d.",
col->dims().size()));
int input_channels =
(data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]);
......@@ -65,27 +71,33 @@ class Vol2ColFunctor<platform::CPUDeviceContext, T> {
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back -
auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1,
output_depth,
"input_depth and output_depth are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down -
1;
PADDLE_ENFORCE_EQ(
input_depth_tmp, output_depth,
platform::errors::InvalidArgument(
"input_depth(%d) and output_depth(%d) are mismatching.",
input_depth_tmp, output_depth));
auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1,
output_height,
"input_height and output_height are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right -
1;
PADDLE_ENFORCE_EQ(
input_height_tmp, output_height,
platform::errors::InvalidArgument(
"input_height(%d) and output_height(%d) are mismatching.",
input_height_tmp, output_height));
auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1,
output_width,
"input_width and output_width are "
"mismatching.");
1;
PADDLE_ENFORCE_EQ(
input_width_tmp, output_width,
platform::errors::InvalidArgument(
"input_width(%d) and output_width(%d) are mismatching.",
input_width_tmp, output_width));
const T* vol_data = vol.data<T>();
T* col_data = col->data<T>();
......@@ -140,10 +152,16 @@ class Col2VolFunctor<platform::CPUDeviceContext, T> {
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* vol,
const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol->dims().size(), 4,
"The dimension of vol should be 4.");
PADDLE_ENFORCE_EQ(col.dims().size(), 7,
"The dimension of col should be 7.");
PADDLE_ENFORCE_EQ(
vol->dims().size(), 4,
platform::errors::InvalidArgument("The dimension of vol"
" should be 4, but received %d.",
vol->dims().size()));
PADDLE_ENFORCE_EQ(
col.dims().size(), 7,
platform::errors::InvalidArgument("The dimension of col"
" should be 7, but received %d.",
col.dims().size()));
int input_channels =
(data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]);
......@@ -170,27 +188,33 @@ class Col2VolFunctor<platform::CPUDeviceContext, T> {
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back -
auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1,
output_depth,
"input_depth and output_depth are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down -
1;
PADDLE_ENFORCE_EQ(input_depth_tmp, output_depth,
platform::errors::InvalidArgument(
"input_depth(%d)"
" and output_depth(%d) are mismatching.",
input_depth_tmp, output_depth));
auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1,
output_height,
"input_height and output_height are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right -
1;
PADDLE_ENFORCE_EQ(input_height_tmp, output_height,
platform::errors::InvalidArgument(
"input_height(%d)"
" and output_height(%d) are mismatching.",
input_height_tmp, output_height));
auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1,
output_width,
"input_width and output_width are "
"mismatching.");
1;
PADDLE_ENFORCE_EQ(input_width_tmp, output_width,
platform::errors::InvalidArgument(
"input_width(%d)"
" and output_width(%d) are mismatching.",
input_width_tmp, output_width));
T* vol_data = vol->data<T>();
const T* col_data = col.data<T>();
......
......@@ -90,10 +90,16 @@ class Vol2ColFunctor<platform::CUDADeviceContext, T> {
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* col,
const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol.dims().size(), 4,
"The dimension of vol should be 4.");
PADDLE_ENFORCE_EQ(col->dims().size(), 7,
"The dimension of col should be 7.");
PADDLE_ENFORCE_EQ(
vol.dims().size(), 4,
platform::errors::InvalidArgument("The dimension of"
" vol should be 4, but received %d.",
vol.dims().size()));
PADDLE_ENFORCE_EQ(
col->dims().size(), 7,
platform::errors::InvalidArgument("The dimension of"
"col should be 7, but received %d.",
col->dims().size()));
int input_channels =
(data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]);
......@@ -117,27 +123,33 @@ class Vol2ColFunctor<platform::CUDADeviceContext, T> {
int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1];
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back -
auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1,
output_depth,
"input_depth and output_depth are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down -
1;
PADDLE_ENFORCE_EQ(
input_depth_tmp, output_depth,
platform::errors::InvalidArgument(
"input_depth(%d) and output_depth(%d) are mismatching.",
input_depth_tmp, output_depth));
auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1,
output_height,
"input_height and output_height are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right -
1;
PADDLE_ENFORCE_EQ(
input_height_tmp, output_height,
platform::errors::InvalidArgument(
"input_height(%d) and output_height(%d) are mismatching.",
input_height_tmp, output_height));
auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1,
output_width,
"input_width and output_width are "
"mismatching.");
1;
PADDLE_ENFORCE_EQ(
input_width_tmp, output_width,
platform::errors::InvalidArgument(
"input_width(%d) and output_width(%d) are mismatching.",
input_width_tmp, output_width));
int num_outputs =
input_channels * output_depth * output_height * output_width;
......@@ -241,10 +253,16 @@ class Col2VolFunctor<platform::CUDADeviceContext, T> {
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* vol,
const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol->dims().size(), 4,
"The dimension of vol should be 4.");
PADDLE_ENFORCE_EQ(col.dims().size(), 7,
"The dimension of col should be 7.");
PADDLE_ENFORCE_EQ(
vol->dims().size(), 4,
platform::errors::InvalidArgument("The dimension of vol"
" should be 4, but received %d.",
vol->dims().size()));
PADDLE_ENFORCE_EQ(
col.dims().size(), 7,
platform::errors::InvalidArgument("The dimension of col"
" should be 7, but received %d.",
col.dims().size()));
int input_channels =
(data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]);
......@@ -269,27 +287,33 @@ class Col2VolFunctor<platform::CUDADeviceContext, T> {
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back -
auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1,
output_depth,
"input_depth and output_depth are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down -
1;
PADDLE_ENFORCE_EQ(input_depth_tmp, output_depth,
platform::errors::InvalidArgument(
"input_depth(%d)"
" and output_depth(%d) are mismatching.",
input_depth_tmp, output_depth));
auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1,
output_height,
"input_height and output_height are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right -
1;
PADDLE_ENFORCE_EQ(input_height_tmp, output_height,
platform::errors::InvalidArgument(
"input_height(%d)"
" and output_height(%d) are mismatching.",
input_height_tmp, output_height));
auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1,
output_width,
"input_width and output_width are "
"mismatching.");
1;
PADDLE_ENFORCE_EQ(input_width_tmp, output_width,
platform::errors::InvalidArgument(
"input_width(%d)"
" and output_width(%d) are mismatching.",
input_width_tmp, output_width));
int num_kernels = input_channels * input_depth * input_height * input_width;
......
......@@ -30,8 +30,7 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> {
float std = context.Attr<float>("std");
auto* tensor = context.Output<framework::Tensor>("Out");
const std::string op_type = "gaussian_random";
auto shape = GetShape(context, op_type);
auto shape = GetShape(context);
tensor->Resize(shape);
T* data = tensor->mutable_data<T>(context.GetPlace());
int64_t size = tensor->numel();
......
......@@ -48,6 +48,9 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<float>("lars_weight_decay",
"(float, default 0.0005) LARS weight decay")
.SetDefault(0.0005);
AddAttr<float>("epsilon",
"(float, default 0.0) epsilon to avoid Division by Zero.")
.SetDefault(0.0);
AddComment(R"DOC(
Lars Momentum Optimizer.
......
......@@ -23,14 +23,16 @@ __global__ void MomentumLarsKernel(const T* p, const T* g, const T* v,
const T* learning_rate, const T mu,
const int64_t num, const T lars_coeff,
const T lars_weight_decay, const T* p_norm,
const T* g_norm, T* p_out, T* v_out) {
const T* g_norm, T* p_out, T* v_out,
const T epsilon) {
T lr = learning_rate[0];
T local_lr = learning_rate[0];
CUDA_KERNEL_LOOP(i, num) {
if (p_norm[0] > 0 && g_norm[0] > 0) {
if (lars_weight_decay > 0 && p_norm[0] > 0 && g_norm[0] > 0) {
local_lr = lr * lars_coeff * p_norm[0] /
(g_norm[0] + lars_weight_decay * p_norm[0]);
(g_norm[0] + lars_weight_decay * p_norm[0] + epsilon);
}
T v_new = v[i] * mu + local_lr * (g[i] + lars_weight_decay * p[i]);
v_out[i] = v_new;
p_out[i] = p[i] - v_new;
......@@ -54,6 +56,7 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
T mu = static_cast<T>(ctx.Attr<float>("mu"));
T lars_coeff = ctx.Attr<float>("lars_coeff");
T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");
T epsilon = ctx.Attr<float>("epsilon");
auto* p = param->data<T>();
auto* v = velocity->data<T>();
......@@ -79,7 +82,7 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
eg_norm.device(*place) = eigen_g.square().sum().sqrt();
MomentumLarsKernel<<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay,
p_norm_data, g_norm_data, p_out, v_out);
p_norm_data, g_norm_data, p_out, v_out, epsilon);
}
};
......
......@@ -39,6 +39,7 @@ class LarsMomentumOpKernel : public framework::OpKernel<T> {
T mu = static_cast<T>(ctx.Attr<float>("mu"));
T lars_coeff = ctx.Attr<float>("lars_coeff");
T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");
T epsilon = ctx.Attr<float>("epsilon");
auto p_out = framework::EigenVector<T>::Flatten(*param_out);
auto v_out = framework::EigenVector<T>::Flatten(*velocity_out);
......@@ -59,9 +60,9 @@ class LarsMomentumOpKernel : public framework::OpKernel<T> {
ep_norm = p.square().sum().sqrt();
eg_norm = g.square().sum().sqrt();
T local_lr = lr[0];
if (ep_norm(0) > 0 && eg_norm(0) > 0) {
if (lars_weight_decay > 0 && ep_norm(0) > 0 && eg_norm(0) > 0) {
local_lr = lr[0] * lars_coeff * ep_norm(0) /
(eg_norm(0) + lars_weight_decay * ep_norm(0));
(eg_norm(0) + lars_weight_decay * ep_norm(0) + epsilon);
}
v_out = v * mu + local_lr * (g + lars_weight_decay * p);
p_out = p - v_out;
......
include(operators)
if(WITH_GPU)
if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
register_operators(DEPS cub)
else()
register_operators()
endif()
else()
register_operators()
endif()
......@@ -24,5 +28,9 @@ if(WITH_GPU)
endif()
if(WITH_GPU)
if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor cub)
else()
nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor)
endif()
endif()
......@@ -13,18 +13,138 @@
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/logsumexp_op.h"
#include <memory>
#include <algorithm>
#include <string>
#include <utility>
#include <vector>
namespace paddle {
namespace operators {
class LogsumexpOpMaker : public ops::ReduceOpMaker {
protected:
virtual std::string GetName() const { return "logsumexp"; }
virtual std::string GetOpType() const { return "Reduce logsumexp"; }
class LogsumexpOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "logsumexp");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "logsumexp");
auto x_dims = ctx->GetInputDim("X");
auto x_rank = x_dims.size();
PADDLE_ENFORCE_LE(x_rank, 4,
platform::errors::InvalidArgument(
"The input tensor X's dimensions of logsumexp "
"should be less equal than 4. But received X's "
"dimensions = %d, X's shape = [%s].",
x_rank, x_dims));
auto axis = ctx->Attrs().Get<std::vector<int>>("axis");
PADDLE_ENFORCE_GT(
axis.size(), 0,
platform::errors::InvalidArgument(
"The size of axis of logsumexp "
"should be greater than 0. But received the size of axis "
"of logsumexp is %d.",
axis.size()));
for (size_t i = 0; i < axis.size(); i++) {
PADDLE_ENFORCE_LT(
axis[i], x_rank,
platform::errors::InvalidArgument(
"axis[%d] should be in the "
"range [-dimension(X), dimension(X)] "
"where dimesion(X) is %d. But received axis[i] = %d.",
i, x_rank, axis[i]));
PADDLE_ENFORCE_GE(
axis[i], -x_rank,
platform::errors::InvalidArgument(
"axis[%d] should be in the "
"range [-dimension(X), dimension(X)] "
"where dimesion(X) is %d. But received axis[i] = %d.",
i, x_rank, axis[i]));
if (axis[i] < 0) {
axis[i] += x_rank;
}
}
bool keepdim = ctx->Attrs().Get<bool>("keepdim");
bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
auto dims_vector = vectorize(x_dims);
if (reduce_all) {
if (keepdim)
ctx->SetOutputDim(
"Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
else
ctx->SetOutputDim("Out", {1});
} else {
auto dims_vector = vectorize(x_dims);
if (keepdim) {
for (size_t i = 0; i < axis.size(); ++i) {
dims_vector[axis[i]] = 1;
}
} else {
const int kDelFlag = -1;
for (size_t i = 0; i < axis.size(); ++i) {
dims_vector[axis[i]] = kDelFlag;
}
dims_vector.erase(
std::remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
dims_vector.end());
}
if (!keepdim && dims_vector.size() == 0) {
dims_vector.push_back(1);
}
auto out_dims = framework::make_ddim(dims_vector);
ctx->SetOutputDim("Out", out_dims);
if (axis.size() > 0 && axis[0] != 0) {
// Only pass LoD when not reducing on the first dim.
ctx->ShareLoD("X", /*->*/ "Out");
}
}
}
};
class LogsumexpOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor) The input tensor. Tensors with rank at most 4 are "
"supported.");
AddOutput("Out", "(Tensor) The result tensor.");
AddAttr<std::vector<int>>(
"axis",
"(list<int>, default {0}) The dimensions to reduce. "
"Must be in the range [-rank(input), rank(input)). "
"If `axis[i] < 0`, the axis[i] to reduce is `rank + axis[i]`. "
"Note that reducing on the first dim will make the LoD info lost.")
.SetDefault({0});
AddAttr<bool>("keepdim",
"(bool, default false) "
"If true, retain the reduced dimension with length 1.")
.SetDefault(false);
AddAttr<bool>("reduce_all",
"(bool, default false) "
"If true, output a scalar reduced along all dimensions.")
.SetDefault(false);
AddComment(string::Sprintf(R"DOC(
logsumexp Operator.
This operator computes the logsumexp of input tensor along the given axis.
The result tensor has 1 fewer dimension than the input unless keep_dim is true.
If reduce_all is true, just reduce along all dimensions and output a scalar.
)DOC"));
}
};
class LogsumexpGrapOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "logsumexp");
OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "logsumexp");
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
"Out@GRAD", "logsumexp");
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
};
template <typename T>
......@@ -32,7 +152,6 @@ class LogsumexpGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("logsumexp_grad");
op->SetInput("X", this->Input("X"));
......@@ -46,18 +165,17 @@ class LogsumexpGradOpMaker : public framework::SingleGradOpMaker<T> {
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(logsumexp, ops::ReduceOp, ops::LogsumexpOpMaker,
namespace ops = paddle::operators;
REGISTER_OPERATOR(logsumexp, ops::LogsumexpOp, ops::LogsumexpOpMaker,
ops::LogsumexpGradOpMaker<paddle::framework::OpDesc>,
ops::LogsumexpGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(logsumexp_grad, ops::ReduceGradOp);
REGISTER_OPERATOR(logsumexp_grad, ops::LogsumexpGrapOp);
REGISTER_OP_CPU_KERNEL(logsumexp,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
float, ops::LogsumexpFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
double, ops::LogsumexpFunctor>);
REGISTER_OP_CPU_KERNEL(
logsumexp_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
float, ops::LogsumexpGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
ops::LogsumexpGradFunctor>);
logsumexp, ops::LogsumexpKernel<paddle::platform::CPUDeviceContext, float>,
ops::LogsumexpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
logsumexp_grad,
ops::LogsumexpGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::LogsumexpGradKernel<paddle::platform::CPUDeviceContext, double>);
......@@ -14,8 +14,8 @@
#include "paddle/fluid/operators/reduce_ops/logsumexp_op.h"
REGISTER_OP_CUDA_KERNEL(logsumexp,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::LogsumexpFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
double, ops::LogsumexpFunctor>);
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
logsumexp, ops::LogsumexpKernel<paddle::platform::CUDADeviceContext, float>,
ops::LogsumexpKernel<paddle::platform::CUDADeviceContext, double>);
......@@ -15,8 +15,9 @@
// .part used to speed up nvcc compile
#include "paddle/fluid/operators/reduce_ops/logsumexp_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
logsumexp_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::LogsumexpGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
ops::LogsumexpGradFunctor>);
logsumexp_grad,
ops::LogsumexpGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::LogsumexpGradKernel<paddle::platform::CUDADeviceContext, double>);
......@@ -27,9 +27,6 @@ class RunProgramOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(ctx->HasInputs("X"), true,
platform::errors::NotFound(
"Input(X) of RunProgramOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInputs("Params"), true,
platform::errors::NotFound(
"Input(Params) of RunProgramOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutputs("Out"), true,
platform::errors::NotFound(
"Output(Out) of RunProgramOp should not be null."));
......@@ -73,7 +70,8 @@ class RunProgramOpMaker : public framework::OpProtoAndCheckerMaker {
"(vector<LoDTensor or SelecetedRows>)"
"The input parameter of RunProgram operator, also the parameters "
"of the loaded program.")
.AsDuplicable();
.AsDuplicable()
.AsDispensable();
AddOutput("Out",
"(vector<LoDTensor>)"
"The output tensors of RunProgram operator, also the fetch "
......@@ -121,10 +119,6 @@ class RunProgramGradOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(ctx->HasInputs("X"), true,
platform::errors::NotFound(
"Input(X) of RunProgramGradOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasInputs("Params"), true,
platform::errors::NotFound(
"Input(Params) of RunProgramGradOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasInputs(framework::GradVarName("Out")), true,
platform::errors::NotFound(
......
......@@ -209,9 +209,14 @@ class RunProgramOpKernel : public framework::OpKernel<T> {
auto output_vars = ctx.MultiOutputVar("Out");
auto input_var_names = ctx.InputNames("X");
auto param_names = ctx.InputNames("Params");
auto output_var_names = ctx.OutputNames("Out");
// current program may not hold parameters
std::vector<std::string> param_names;
if (!param_vars.empty()) {
param_names = ctx.InputNames("Params");
}
auto *block = ctx.Attr<BlockDesc *>("global_block");
auto *program = block->Program();
auto start_op_index = ctx.Attr<int64_t>("start_op_index");
......
此差异已折叠。
......@@ -81,5 +81,26 @@ inline std::vector<T> GetDataFromTensorList(
}
return vec_new_data;
}
inline framework::DDim GetShape(const framework::ExecutionContext& ctx) {
// 1. shape is a Tensor
if (ctx.HasInput("ShapeTensor")) {
auto* shape_tensor = ctx.Input<framework::LoDTensor>("ShapeTensor");
auto vec_shape = GetDataFromTensor<int>(shape_tensor);
return framework::make_ddim(vec_shape);
}
// 2. shape is a list/tuple containing Tensor
auto shape_tensor_list = ctx.MultiInput<framework::Tensor>("ShapeTensorList");
if (shape_tensor_list.size() > 0) {
auto vec_shape = GetDataFromTensorList(shape_tensor_list);
return framework::make_ddim(vec_shape);
}
// 3. shape is a list/tuple without containing Tensor
auto vec_shape = ctx.Attr<std::vector<int64_t>>("shape");
return framework::make_ddim(vec_shape);
}
} // namespace operators
} // namespace paddle
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册