diff --git a/CMakeLists.txt b/CMakeLists.txt index 996a79fbbc3005680205e9fc0442b6bc6199bebb..9cfec8e70b4a3d166e3b45048408d7f5e45ce6e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -315,7 +315,6 @@ endif() if (ON_INFER) message(STATUS "On inference mode, will take place some specific optimization.") - add_definitions(-DPADDLE_ON_INFERENCE) else() #TODO(luotao), combine this warning with `make inference_lib_dist` command. message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.") diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 7f5771e561f6cc419fc9b3094174645ac432546e..4e17ddee73958106d5e2c8c8ea5661acc758518a 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -218,3 +218,7 @@ endif(WITH_GRPC) if(WITH_BRPC_RDMA) add_definitions(-DPADDLE_WITH_BRPC_RDMA) endif(WITH_BRPC_RDMA) + +if(ON_INFER) + add_definitions(-DPADDLE_ON_INFERENCE) +endif(ON_INFER) diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index c384456b648d4497bf4bd003b183b773186e0f15..e8e53f988f92685cd4854b21202bcf7f9b1a4383 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -70,6 +70,16 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc, } void NaiveExecutor::Run() { +#ifndef PADDLE_ON_INFERENCE + LOG_FIRST_N(WARNING, 15) << "The NaiveExecutor can not work properly if the " + "cmake flag ON_INFER is not set."; + LOG_FIRST_N(WARNING, 15) << "Unlike the training phase, all the scopes and " + "variables will be reused to save the allocation " + "overhead."; + LOG_FIRST_N(WARNING, 15) << "Please re-compile the inference library by " + "setting the cmake flag ON_INFER=ON if you are " + "running Paddle Inference"; +#endif // PADDLE_ON_INFERENCE for (auto &op : ops_) { VLOG(3) << std::this_thread::get_id() << " run " << op->Type() << " on scope " << scope_; diff --git 
a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h index c59b232191c49ccb47bb9f51dcaf2fd9280fae19..ac0330218973123771367ed5ba9477c90143a043 100644 --- a/paddle/fluid/framework/op_kernel_type.h +++ b/paddle/fluid/framework/op_kernel_type.h @@ -63,6 +63,8 @@ struct OpKernelType { place_(dev_ctx.GetPlace()), library_type_(library_type) {} + size_t hash_key() const { return Hash()(*this); } + bool operator==(const OpKernelType& o) const { return platform::places_are_same_class(place_, o.place_) && data_type_ == o.data_type_ && data_layout_ == o.data_layout_ && diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 6bd744edc22e6a90ce64e9d699e7f3c5c60d4908..2b35943d092518c7f45a8ed3b708532666a23353 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -35,6 +35,11 @@ DEFINE_bool(check_nan_inf, false, namespace paddle { namespace framework { +// Combine two hash values to a single hash. +inline size_t CombineHash(size_t seed, size_t a) { + return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2); +} + std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = { std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN), std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain), @@ -794,6 +799,17 @@ void OperatorWithKernel::TransferInplaceVarsBack( Scope* OperatorWithKernel::TryTransferData( const Scope& scope, const OpKernelType& expected_kernel_key, std::vector<std::string>* transfered_inplace_vars) const { +// In the inference scenario, the scopes will be reused across the batches, so +// the `new_scope` here will result in GPU memory explosion over the running of +// operators. +// We use a thread_local cache to fix that issue, the key in the cache is the +// combination of the `scope` argument, from_kernel_type, target_kernel_type. +// Have a discussion with @Superjomn or the inference developers if some changes +// on this logic for this macro might not be tested on the other scenarios. 
+#ifdef PADDLE_ON_INFERENCE + thread_local std::unordered_map<size_t, Scope*> infer_transfer_scope_cache; +#endif + Scope* new_scope = nullptr; for (auto& var_name_item : Inputs()) { for (auto& var_name : var_name_item.second) { @@ -824,11 +840,28 @@ Scope* OperatorWithKernel::TryTransferData( VLOG(30) << "Transform Variable " << var_name << " from " << kernel_type_for_var << " to " << expected_kernel_key; +#ifdef PADDLE_ON_INFERENCE + size_t infer_cache_key = + CombineHash(OpKernelType::Hash()(kernel_type_for_var), + OpKernelType::Hash()(expected_kernel_key)); + infer_cache_key = + CombineHash(infer_cache_key, std::hash<const Scope*>()(&scope)); + + auto it = infer_transfer_scope_cache.find(infer_cache_key); + if (it != infer_transfer_scope_cache.end()) { + new_scope = infer_transfer_scope_cache[infer_cache_key]; + } else { + new_scope = &scope.NewScope(); + infer_transfer_scope_cache[infer_cache_key] = new_scope; + } +#endif + if (new_scope == nullptr) { new_scope = &scope.NewScope(); } auto* trans_var = new_scope->Var(var_name); + Tensor out; TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out); SetTensorToVariable(*var, out, trans_var); diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index bbeef150254f8f7a1f382a5b81055a6a5589eee1..26cb7d51a88afac15322eecad965912097d19a45 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -42,7 +42,7 @@ DEFINE_double( // a mean time, but a scope may be read by multiple threads concurrently, and // the mutex will cause serious performance issue. // So the mutex is disabled when `ON_INFER`. -#ifdef ON_INFER +#ifdef PADDLE_ON_INFERENCE #define SCOPE_LOCK_GUARD #else #define SCOPE_LOCK_GUARD std::lock_guard<std::mutex> lock(mutex_);