diff --git a/README.md b/README.md index 8d89c6b1ec9e4aefbd64328dedb4e8c7cc50c21b..63abca069a6629ac59739224ded9cd9f06207d0a 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,6 @@ [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) [![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html) [![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html) -[![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 9c42044ec163e9db1dd21d5c3915b010c30fdf1c..fd7fc16bff5651f022b484623243048fbd225b5a 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -96,6 +96,20 @@ if(NOT APPLE AND NOT ANDROID) set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt") endif(NOT APPLE AND NOT ANDROID) +set_property(GLOBAL PROPERTY FLUID_MODULES "") +# find all fluid modules is used for paddle fluid static library +# for building inference libs +function(find_fluid_modules TARGET_NAME) + get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE) + string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path}) + string(FIND "${__target_path}" "fluid" pos) + if(pos GREATER 1) + get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) + set(fluid_modules ${fluid_modules} ${TARGET_NAME}) + set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}") + endif() 
+endfunction(find_fluid_modules) + function(merge_static_libs TARGET_NAME) set(libs ${ARGN}) list(REMOVE_DUPLICATES libs) @@ -250,6 +264,7 @@ function(cc_test TARGET_NAME) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) if (${cc_test_SERIAL}) set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) endif() endif() endfunction(cc_test) @@ -314,6 +329,7 @@ function(nv_test TARGET_NAME) add_test(${TARGET_NAME} ${TARGET_NAME}) if (nv_test_SERIAL) set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) endif() endif() endfunction(nv_test) @@ -561,7 +577,7 @@ function(py_test TARGET_NAME) set(multiValueArgs SRCS DEPS ARGS ENVS) cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_test(NAME ${TARGET_NAME} - COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} + COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 850098297e1456487cb8a7b83dffd3d2b0478689..0c720faa353438b76a72e1574cb90931ddd0cf73 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -12,19 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-set_property(GLOBAL PROPERTY FLUID_MODULES "") -# find all fluid modules is used for paddle fluid static library -function(find_fluid_modules TARGET_NAME) - get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE) - string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path}) - string(FIND "${__target_path}" "fluid" pos) - if(pos GREATER 1) - get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) - set(fluid_modules ${fluid_modules} ${TARGET_NAME}) - set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}") - endif() -endfunction(find_fluid_modules) - # make package for paddle fluid shared and static library function(copy TARGET) set(options "") @@ -163,9 +150,9 @@ if(WITH_CONTRIB) list(APPEND inference_deps contrib_anakin_inference_lib) endif() - copy(contrib_inference_lib DEPS paddle_inference_api + copy(contrib_inference_lib DEPS paddle_inference_api paddle_inference_api_shared SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h - ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api.* + ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api* DSTS ${contrib_dst_dir} ${contrib_dst_dir}) list(APPEND inference_deps contrib_inference_lib) endif() diff --git a/doc/fluid/design/multi_devices/kernel_selection.md b/doc/fluid/design/multi_devices/kernel_selection.md index 967317d5d2eeb818ab14faabca342cc8c4ed717e..4d2aab87b8cf30d03075e96cc4c67070efaf963a 100644 --- a/doc/fluid/design/multi_devices/kernel_selection.md +++ b/doc/fluid/design/multi_devices/kernel_selection.md @@ -74,10 +74,10 @@ void OperatorWithKernel::Run( auto kernel_type_for_var = this->GetKernelTypeForVar(...); if (kernel_type_for_var.place_ != expected_kernel_key.place_) { auto* trans_var = new_scope.Var(var_name); - auto* out = DataTransform(expected_kernel_key, + auto* out = TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in); - CopyVariableWithTensor(...); + SetTensorToVariable(...); } } diff --git 
a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt index 2cd6ab2bbf042bced41957193a0269f477eb10d0..a8bbb4eb8081420ae0bbaf761bd27303c0d043cb 100644 --- a/paddle/contrib/inference/CMakeLists.txt +++ b/paddle/contrib/inference/CMakeLists.txt @@ -46,6 +46,10 @@ cc_library(paddle_inference_api SRCS paddle_inference_api.cc paddle_inference_api_impl.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) +cc_library(paddle_inference_api_shared SHARED + SRCS paddle_inference_api.cc paddle_inference_api_impl.cc + DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) + cc_test(test_paddle_inference_api SRCS test_paddle_inference_api.cc DEPS paddle_inference_api) diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index bc48fd3b479157d4aea390cd5f4dc61ea46dca4b..cd00b7de7338982308acfa1f1e8c38e010c6a43b 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -147,9 +147,9 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, "Input tensor type is not supported: ", in.type().name()); memory::data_type out_type = in_type; - auto in_format = MKLDNNFormatForSize(in_tz.size(), in.format()); + auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format()); auto out_format = - MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout)); + platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout)); void* in_data = GetDataFromTensor(in, in_type); diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index 67f91e4e48d3e11ed493c5e6943cb9071aff60c4..90bb206ec6b698bc23ad1a5c9609a25186ec6de8 100644 --- a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -62,12 +62,6 @@ inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) { return MKLDNNDataType::data_undef; } -inline MKLDNNFormat 
MKLDNNFormatForSize(size_t dims_size, - MKLDNNFormat default_format) { - return (dims_size == 1 - ? mkldnn::memory::format::x - : dims_size == 2 ? mkldnn::memory::format::nc : default_format); -} #endif void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index 5f15e20c78fd5a333523fe9e73542c037a161cae..82872224501709080ff02a13464d58543a0abda8 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -18,17 +18,21 @@ limitations under the License. */ #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/data_type_transform.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + namespace paddle { namespace framework { -static void PassTensorData(Tensor* from, Tensor* to) { +static void PassTensorData(Tensor *from, Tensor *to) { to->ShareDataWith(*from); *from = Tensor(); } -void DataTransform(const OpKernelType& expected_kernel_type, - const OpKernelType& kernel_type_for_var, - const Tensor& input_tensor, Tensor* output_tensor) { +void TransformData(const OpKernelType &expected_kernel_type, + const OpKernelType &kernel_type_for_var, + const Tensor &input_tensor, Tensor *output_tensor) { bool transformed = false; Tensor in; in.ShareDataWith(input_tensor); @@ -48,8 +52,8 @@ void DataTransform(const OpKernelType& expected_kernel_type, // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel // Just set layout/format. 
No real transform occur - auto out_format = - MKLDNNFormatForSize(in.dims().size(), ToMKLDNNFormat(lin)); + auto out_format = platform::MKLDNNFormatForSize(in.dims().size(), + ToMKLDNNFormat(lin)); out.ShareDataWith(input_tensor); out.set_layout(DataLayout::kMKLDNN); @@ -89,17 +93,17 @@ void DataTransform(const OpKernelType& expected_kernel_type, output_tensor->ShareDataWith(in); } -void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, - Variable* out_var) { +void SetTensorToVariable(const Variable &in_var, const Tensor &tensor, + Variable *out_var) { if (in_var.IsType()) { - auto& in_lod_tensor = in_var.Get(); - auto* tran_lod_tensor = out_var->GetMutable(); + auto &in_lod_tensor = in_var.Get(); + auto *tran_lod_tensor = out_var->GetMutable(); tran_lod_tensor->set_lod(in_lod_tensor.lod()); tran_lod_tensor->set_layout(in_lod_tensor.layout()); tran_lod_tensor->ShareDataWith(tensor); } else if (in_var.IsType()) { - auto& in_selected_rows = in_var.Get(); - auto* trans_selected_rows = out_var->GetMutable(); + auto &in_selected_rows = in_var.Get(); + auto *trans_selected_rows = out_var->GetMutable(); trans_selected_rows->set_height(in_selected_rows.height()); trans_selected_rows->set_rows(in_selected_rows.rows()); trans_selected_rows->mutable_value()->ShareDataWith(tensor); diff --git a/paddle/fluid/framework/data_transform.h b/paddle/fluid/framework/data_transform.h index dee5d8c7c1126013742460df1d94bb364220ad09..ae3ab051bda2e698801cc6fe6e3ddddf039f5385 100644 --- a/paddle/fluid/framework/data_transform.h +++ b/paddle/fluid/framework/data_transform.h @@ -30,12 +30,15 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -void DataTransform(const OpKernelType& expected_kernel_type, - const OpKernelType& kernel_type_for_var, - const Tensor& input_tensor, Tensor* out); - -void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, - Variable* out_var); +void TransformData(const OpKernelType &expected_kernel_type, + const OpKernelType &kernel_type_for_var, + const Tensor &input_tensor, Tensor *out); + +/** + * Set OutVar from InVar, except the tensor is shared with `tensor` + */ +void SetTensorToVariable(const Variable &in_var, const Tensor &tensor, + Variable *out_var); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h index f51a184e7bae2283f335fe9462a77b9c5fb831a5..c59b232191c49ccb47bb9f51dcaf2fd9280fae19 100644 --- a/paddle/fluid/framework/op_kernel_type.h +++ b/paddle/fluid/framework/op_kernel_type.h @@ -97,7 +97,7 @@ inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) { return ret; } -inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) { +inline bool NeedTransform(const OpKernelType& l, const OpKernelType& r) { return (!platform::places_are_same_class(l.place_, r.place_)) || (l.data_type_ != r.data_type_) || NeedTransformLayout(l.data_layout_, r.data_layout_); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index c1329b06d7e9bcd6604fed14cefa305339c5c4b8..aa1a42fc97310e4c149f2018f1706d8e42630a98 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -620,8 +620,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, "There are no kernels which are registered in the %s operator.", type_); } - ExecutionContext ctx(*this, scope, *dev_ctx); - OpKernelMap& kernels = kernels_iter->second; // TODO(dzhwinter) : kernel fallback mechanism will be added when all the @@ -631,7 +629,8 @@ void OperatorWithKernel::RunImpl(const 
Scope& scope, // Do selection // } - auto expected_kernel_key = this->GetExpectedKernelType(ctx); + auto expected_kernel_key = + this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx)); VLOG(3) << "expected_kernel_key:" << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); @@ -640,56 +639,34 @@ void OperatorWithKernel::RunImpl(const Scope& scope, KernelTypeToString(expected_kernel_key)); } - // do data transform - Scope& new_scope = scope.NewScope(); + // do data transformScope &transfer_scope; + std::vector transfered_inplace_vars; + auto* transfer_scope = + TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars); - std::vector inplace_vars; - for (auto& var_name_item : this->Inputs()) { - for (auto& var_name : var_name_item.second) { - auto* var = scope.FindVar(var_name); - if (var && VarIsTensor(var)) { - auto* tensor_in = GetTensorFromVar(var); - if (tensor_in->IsInitialized()) { - auto kernel_type_for_var = this->GetKernelTypeForVar( - var_name_item.first, *tensor_in, expected_kernel_key); - if (TransFromNeeded(kernel_type_for_var, expected_kernel_key)) { - auto out_var_names = OutputVars(true); - if (std::find(out_var_names.begin(), out_var_names.end(), - var_name) != out_var_names.end()) { - inplace_vars.push_back(var_name); - } - VLOG(3) << "Transform Variable " << var_name << " from " - << kernel_type_for_var << " to " << expected_kernel_key; - auto* trans_var = new_scope.Var(var_name); - std::shared_ptr out(new Tensor); - DataTransform(expected_kernel_key, kernel_type_for_var, *tensor_in, - out.get()); - CopyVariableWithTensor(*var, *(out.get()), trans_var); - } - } - } - } + // exec scope is the scope that kernel actually executed on. + const Scope& exec_scope = + (transfer_scope == nullptr ? 
scope : *transfer_scope); + + if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) { + dev_ctx = pool.Get(expected_kernel_key.place_); } - auto* new_dev_ctx = pool.Get(expected_kernel_key.place_); - kernel_iter->second->Compute( - ExecutionContext(*this, new_scope, *new_dev_ctx)); + kernel_iter->second->Compute(ExecutionContext(*this, exec_scope, *dev_ctx)); - for (auto& var_name : inplace_vars) { - VLOG(3) << "share inplace var " + var_name + " back to it's original scope"; - auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name)); - auto* transformed_tensor = GetTensorFromVar(new_scope.FindVar(var_name)); - original_tensor->ShareDataWith(*transformed_tensor); + if (!transfered_inplace_vars.empty()) { + // there is inplace variable has been transfered. + TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope); } /*For profiling/benchmark only*/ if (FLAGS_benchmark) { - new_dev_ctx->Wait(); + dev_ctx->Wait(); } if (FLAGS_check_nan_inf) { for (auto& vname : OutputVars(true)) { - auto* var = new_scope.FindVar(vname); + auto* var = exec_scope.FindVar(vname); if (var == nullptr) continue; if (var->IsType()) { CheckTensorNANOrInf(vname, var->Get()); @@ -697,6 +674,64 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } } } +void OperatorWithKernel::TransferInplaceVarsBack( + const Scope& scope, const std::vector& inplace_vars, + const Scope& transfer_scope) const { + for (auto& var_name : inplace_vars) { + VLOG(3) << "share inplace var " + var_name + " back to it's original scope"; + auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name)); + auto* transformed_tensor = + GetTensorFromVar(transfer_scope.FindVar(var_name)); + original_tensor->ShareDataWith(*transformed_tensor); + } +} + +Scope* OperatorWithKernel::TryTransferData( + const Scope& scope, const OpKernelType& expected_kernel_key, + std::vector* transfered_inplace_vars) const { + Scope* new_scope = nullptr; + for (auto& var_name_item : Inputs()) { + 
for (auto& var_name : var_name_item.second) { + auto* var = scope.FindVar(var_name); + // Only tensor can be tranfer to another device. + if (var == nullptr || !VarIsTensor(var)) { + continue; + } + + auto* tensor_in = GetTensorFromVar(var); + if (!tensor_in->IsInitialized()) { + continue; + } + + auto kernel_type_for_var = GetKernelTypeForVar( + var_name_item.first, *tensor_in, expected_kernel_key); + + if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) { + continue; + } + + auto out_var_names = OutputVars(true); + if (std::find(out_var_names.begin(), out_var_names.end(), var_name) != + out_var_names.end()) { + transfered_inplace_vars->emplace_back(var_name); + } + + VLOG(3) << "Transform Variable " << var_name << " from " + << kernel_type_for_var << " to " << expected_kernel_key; + + if (new_scope == nullptr) { + new_scope = &scope.NewScope(); + } + + auto* trans_var = new_scope->Var(var_name); + Tensor out; + TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out); + SetTensorToVariable(*var, out, trans_var); + } + } + + return new_scope; +} proto::VarType::Type OperatorWithKernel::IndicateDataType( const ExecutionContext& ctx) const { diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index b1d75d0d0ff3dccc67a1e833ccfe03a4cad8df39..1550d5df172f0599e1b42e7f1ccf51ac4dd1e0c3 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -384,6 +384,20 @@ class OperatorWithKernel : public OperatorBase { // same. proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const; void RunImpl(const Scope& scope, const platform::Place& place) const final; + + /** + * Transfer data from scope to a transfered scope. If there is no data need to + * be tranfered, it returns nullptr. + * + * * transfered_inplace_vars is a output vector. 
+ */ + Scope* TryTransferData( + const Scope& scope, const OpKernelType& expected_kernel_key, + std::vector* transfered_inplace_vars) const; + + void TransferInplaceVarsBack(const Scope& scope, + const std::vector& inplace_vars, + const Scope& exec_scope) const; }; extern bool OpSupportGPU(const std::string& op_type); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 751b10eeeed10828c08ada4173300c07f81c093e..b53a6f43fbd1f23e69d23ad0fcc54d5c25d352a3 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -253,9 +253,6 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( t->set_lod(lod_tensors[j].lod()); } } - for (auto &p : member_->places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); - } } ParallelExecutor::~ParallelExecutor() { diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index e5bc74755f46449296a153e8b330968e6d9f1e1d..f98011e896f4033ef210e0eb69f93ce7800a3cd6 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -69,7 +69,22 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); auto stream = reinterpret_cast(ctx).stream(); - memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); + if (platform::is_same_place(src_place, dst_place)) { + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + stream); + } else { + if (platform::is_same_place(ctx_place, src_place)) { + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + stream); + platform::DeviceContextPool::Instance().Get(src.place())->Wait(); + } else if (platform::is_same_place(ctx_place, dst_place)) { + platform::DeviceContextPool::Instance().Get(src.place())->Wait(); + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + stream); + } else { + 
PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place."); + } + } } #endif } @@ -78,10 +93,10 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, Tensor* dst) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext* dev_ctx; - if (platform::is_gpu_place(src.place())) { - dev_ctx = pool.Get(src.place()); - } else { + if (platform::is_gpu_place(dst_place)) { dev_ctx = pool.Get(dst_place); + } else { + dev_ctx = pool.Get(src.place()); } TensorCopy(src, dst_place, *dev_ctx, dst); } diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index dca279b69382b80e055f661cefe84b81326704b5..4457382ade37a12f5f3613fc4113fbf1f6f91124 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -23,10 +23,25 @@ limitations under the License. */ namespace paddle { namespace framework { +// NOTE(zcd): Because TensorCopy is an async operation, when the src_place +// and dst_place are two different GPU, to ensure that the operation can +// be carried out correctly, there is a src_ctx wait operation in TensorCopy. +// If ctx_place and src_place are the same, src_ctx.Wait() is added +// after memory::Copy; if ctx_place and dst_place are the same, +// src_ctx.Wait() is added before memory::Copy. void TensorCopy(const Tensor& src, const platform::Place& dst_place, const platform::DeviceContext& ctx, Tensor* dst); + +// NOTE(zcd): If the src.place() and dst_place are two different GPU, +// the copy operation is carried out on the dst_place's stream. This is +// very important, because TensorCopy is an async operator, and in most +// case, once this copy operator returns, dst is to be used in dst_place's +// stream, if this copy operation is carried out on the src_place's stream, +// when dst is used in dst_place's stream the copy operation may be +// not completed. 
void TensorCopy(const Tensor& src, const platform::Place& dst_place, Tensor* dst); + void TensorCopySync(const Tensor& src, const platform::Place& dst_place, Tensor* dst); diff --git a/paddle/fluid/inference/analysis/README.md b/paddle/fluid/inference/analysis/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4c5de189cd1eab1ba3de0b2cdfd2294d139ceab2 --- /dev/null +++ b/paddle/fluid/inference/analysis/README.md @@ -0,0 +1,57 @@ +# Inference Analysis + +The `inference/analysis` module is used to analyze and optimize the inference program. +It borrows some of its philosophy from `LLVM/analysis`, +and makes the various optimization features pluggable so that they can co-exist in a pipeline. + +We borrowed some concepts from LLVM, such as + +- [Pass](./pass.h)es to implement optimizations that traverse the inference program, +- [DataFlowGraph](./data_flow_graph.h) to represent the data flow graph built from a program, +- [PassManager](./pass_manager.h) to manage a sequence of `Pass`es over a graph. + +There are some other basic concepts here + +- [Node](./node.h), the node in a `DataFlowGraph`, + - `Function`, the Operator in Fluid, + - `Value`, the Variable in Fluid; +- [Argument](./argument.h), the argument that is treated as the input and output of all `Pass`es in the pipeline. + +## How it works + +The `inference/analysis` module arranges all the passes in a pipeline, and works in the following way: + +1. Build a `DataFlowGraph` from a Fluid inference ProgramDesc, +2. Call the middle passes one by one, the same `DataFlowGraph` is passed across all the passes, +3. Generate a new ProgramDesc from the modified `DataFlowGraph`. + +New optimization features can be added as independent `Pass`es and controlled by gflags; +each pass will generate unified debug information or visualization for better debugging. 
+ +## Supported Passes + +### `FluidToDataFlowGraphPass` +Transform the fluid `ProgramDesc` to a `DataFlowGraph` to give an abstract representation for all the middle passes; +this should be the first pass of the pipeline. + +### `DataFlowGraphToFluidPass` +Generate a final `ProgramDesc` from a data flow graph; this should be the last pass of the pipeline. + +### `TensorRTSubgraphNodeMarkPass` +Mark the `Node`s that are supported by TensorRT; +this pass will generate a visualization file which can be used for debugging. + +### `TensorRTSubGraphPass` +Split out the sub-graphs that can be accelerated by TensorRT. + +### `DFG_GraphvizDrawPass` +This pass is just for debugging; it visualizes the `DataFlowGraph` using the [graphviz](http://www.graphviz.org) tool. + +It can be used as a helper class that draws the modified graph after each pass. + +## Utilities + +There are some helper functions/classes for analysis. + +- [dot.h](./dot.h) gives an easy-to-use interface for generating `DOT` code, +- [graph_traits.h](./graph_traits.h) contains the graph traversal algorithms; it uses `iterator` to make the algorithms easy to share across different passes. diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index 5d85530969c5bec1c84d5f5b0d2626431a9e1c63..a4625f008c15300b88ef0bce71cd7d8aa473c9a8 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/inference/analysis/analyzer.h" +#include #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" @@ -79,4 +80,4 @@ void Analyzer::Run(Argument* argument) { } // namespace analysis } // namespace inference -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index f290a3777d5be2ef64667d8c17ec59adddc3ef1b..e9e14fb1947da059c8d126d3da182ce446f6421e 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#pragma once + /* * This file contains Analyzer, an class that exposed as a library that analyze * and optimize diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h index 30c60661f3492034248e164a70a682bae3819d23..a4fefc83e0c551d52bec87299bcbc966e7a2dbf7 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.h +++ b/paddle/fluid/inference/analysis/data_flow_graph.h @@ -138,7 +138,7 @@ struct GraphTraits { // sub-graph is the inputs nodes and output nodes that doesn't inside the // sub-graph. 
static std::pair, std::vector> -ExtractInputAndOutputOfSubGraph(std::vector &graph) { +ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT std::unordered_set nodes(graph.begin(), graph.end()); std::unordered_set inputs; std::unordered_set outputs; diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc index e74efd17b834db1d0314c8b7082f3e9c15d6eda3..29ca008123addf07959b965a4b54bf55b18c401d 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" +#include #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/proto_desc.h" @@ -150,13 +151,14 @@ namespace { class DFG_DebuggerPass : public DFG_GraphvizDrawPass { public: using Config = DFG_GraphvizDrawPass::Config; - DFG_DebuggerPass(const Config& config) : DFG_GraphvizDrawPass(config) {} + explicit DFG_DebuggerPass(const Config& config) + : DFG_GraphvizDrawPass(config) {} std::string repr() const override { return "dfg-to-fluid-debuger-pass"; } bool Finalize() override { return true; } }; -} +} // namespace Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const { return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h index 1726e056ed37e2e5fbe2042851ca9bd188806bac..edc84b02ed20991e3e7c6c437d2b1fac169bae03 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h @@ -19,6 +19,7 @@ #pragma once +#include #include "paddle/fluid/framework/program_desc.h" #include 
"paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/pass.h" diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h index b064782586f6243353eda67ac8db040509716b20..17445ab4407a159ca11345bc9a9226b3ad0044f0 100644 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h @@ -46,7 +46,7 @@ class DFG_GraphvizDrawPass : public DataFlowGraphPass { const bool display_deleted_node; }; - DFG_GraphvizDrawPass(const Config &config) : config_(config) {} + explicit DFG_GraphvizDrawPass(const Config &config) : config_(config) {} bool Initialize(Argument *argument) override { return true; } void Run(DataFlowGraph *graph) override; diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc index 5d7eb43b7cbd7bc45b5f0c940bf80ad72348e1b9..e918622d74cfb11d83090555be2a768cc14e7742 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include #include -#include "analyzer.h" +#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" @@ -88,7 +88,8 @@ namespace { class DFG_DebuggerPass : public DFG_GraphvizDrawPass { public: using Config = DFG_GraphvizDrawPass::Config; - DFG_DebuggerPass(const Config &config) : DFG_GraphvizDrawPass(config) {} + explicit DFG_DebuggerPass(const Config &config) + : DFG_GraphvizDrawPass(config) {} std::string repr() const override { return "fluid-to-dfg-debuger-pass"; } bool Finalize() override { return true; } }; diff --git a/paddle/fluid/inference/analysis/pass_manager_tester.cc b/paddle/fluid/inference/analysis/pass_manager_tester.cc index 6caba8f04237e014c5ddf1a3a077bcbadb0ddb71..dac1c509d728114bd24a2ea1150c407646026fd4 100644 --- a/paddle/fluid/inference/analysis/pass_manager_tester.cc +++ b/paddle/fluid/inference/analysis/pass_manager_tester.cc @@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/inference/analysis/pass_manager.h" +#include + #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" +#include "paddle/fluid/inference/analysis/pass_manager.h" #include "paddle/fluid/inference/analysis/ut_helper.h" -#include - namespace paddle { namespace inference { namespace analysis { diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc index 5ad092a9ed201e5e6ab7770bcfd9ddf871779c12..f736e385c11add152dc9ab9485bf1de40f80b2f3 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" +#include + #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/node_attr_flags.h" +#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" namespace paddle { namespace inference { @@ -29,7 +31,7 @@ void TensorRTSubgraphNodeMarkPass::Run(DataFlowGraph *graph) { class DfgDebuggerPass : public DFG_GraphvizDrawPass { public: - DfgDebuggerPass(const DFG_GraphvizDrawPass::Config &config) + explicit DfgDebuggerPass(const DFG_GraphvizDrawPass::Config &config) : DFG_GraphvizDrawPass(config) {} std::string repr() const override { diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 0c74f62de5c6f5d432ee928945db6dcf385ca209..bd98ed81899440a46415d30b6d74fec2dac4c155 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -20,6 +20,12 
@@ limitations under the License. */ #include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/platform/gpu_info.h" +DEFINE_bool(init_allocated_mem, false, + "It is a mistake that the values of the memory allocated by " + "BuddyAllocator are always zeroed in some op's implementation. " + "To find this error in time, we use init_allocated_mem to indicate " + "that initializing the allocated memory with a small value " + "during unit testing."); DECLARE_double(fraction_of_gpu_memory_to_use); namespace paddle { @@ -41,6 +47,9 @@ template <> void* Alloc(platform::CPUPlace place, size_t size) { VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); void* p = GetCPUBuddyAllocator()->Alloc(size); + if (FLAGS_init_allocated_mem) { + memset(p, 0xEF, size); + } VLOG(10) << " pointer=" << p; return p; } @@ -104,6 +113,9 @@ void* Alloc(platform::CUDAPlace place, size_t size) { LOG(WARNING) << "GPU memory used: " << Used(place); platform::SetDeviceId(cur_dev); } + if (FLAGS_init_allocated_mem) { + cudaMemset(ptr, 0xEF, size); + } return ptr; } @@ -137,6 +149,9 @@ void* Alloc(platform::CUDAPinnedPlace place, LOG(WARNING) << "cudaMallocHost Cannot allocate " << size << " bytes in CUDAPinnedPlace"; } + if (FLAGS_init_allocated_mem) { + memset(ptr, 0xEF, size); + } return ptr; } diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/batch_norm_mkldnn_op.cc index 6ecb43c49c30f9da2a273d506f7b85c0a4f5fa2c..9ab2179b5fe689762704039c5f67dd080e530aa5 100644 --- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc @@ -115,9 +115,12 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu; // create mkldnn memory from input x tensor - auto src_memory = - memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine}, - to_void_cast(x_data)); + mkldnn::memory::format input_format = + 
platform::MKLDNNFormatForSize(src_tz.size(), x->format()); + + auto src_memory = memory( + {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine}, + to_void_cast(x_data)); // create primitive descriptor for batch norm forward using bn_fwd_types = bn_type_traits; @@ -251,15 +254,21 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { using bn_bwd_types = bn_type_traits; // create mkldnn memory from input diff_y tensor - auto user_diff_dst_memory = - memory({{{diff_dst_tz}, memory::data_type::f32, diff_y->format()}, - mkldnn_engine}, - to_void_cast(diff_y_data)); + + mkldnn::memory::format dst_format = + platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format()); + + auto user_diff_dst_memory = memory( + {{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine}, + to_void_cast(diff_y_data)); // create mkldnn memory from input x tensor - auto src_memory = - memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine}, - to_void_cast(x_data)); + mkldnn::memory::format input_format = + platform::MKLDNNFormatForSize(src_tz.size(), x->format()); + + auto src_memory = memory( + {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine}, + to_void_cast(x_data)); // for diff_dst, try to use same format as dst in forward pass auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc(); diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index ed99932546446eb877c9701de15e2d37d29b5f88..a6cccc31219104767ac38bdebeb1d4c0e8c2ac01 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -228,7 +228,7 @@ class MKLDNNHandler { return dstr; }; return dims2str(operand_dims) + suffix; - }; + } protected: const MKLDNNDeviceContext& dev_ctx_; @@ -237,5 +237,15 @@ class MKLDNNHandler { bool is_reusing_; }; +inline mkldnn::memory::format MKLDNNFormatForSize( + size_t dims_size, mkldnn::memory::format data_format) { + if (dims_size == 1) { + 
return mkldnn::memory::format::x; + } else if (dims_size == 2) { + return mkldnn::memory::format::nc; + } + return data_format; +} + } // namespace platform } // namespace paddle diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 45af83708ea63fc1b6aa86f1e8423bb44b7388a6..3034c1a0875a71421bcba172c16ee32d809df152 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -118,7 +118,8 @@ def __bootstrap__(): read_env_flags = [ 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', - 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb' + 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', + 'init_allocated_mem' ] if core.is_compiled_with_cuda(): read_env_flags += [ diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 21f2037ad408b0a92718c0ea2bae5e8bf563c665..cddf00765f4894126988c794763c34629449e8e6 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -18,6 +18,8 @@ import unittest import paddle.fluid as fluid import time import numpy as np +import math +import sys __all__ = ['TestParallelExecutorBase'] @@ -93,6 +95,12 @@ class TestParallelExecutorBase(unittest.TestCase): print "%.4f Instance per second" % ( (batch_size * iter + 2) / (end - begin)) + avg_last_loss_val = np.array(last_loss).mean() + avg_first_loss_val = np.array(first_loss).mean() + if math.isnan(float(avg_last_loss_val)) or math.isnan( + float(avg_first_loss_val)): + sys.exit("got NaN loss, training failed.") + print first_loss, last_loss # self.assertGreater(first_loss[0], last_loss[0]) return first_loss, last_loss diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py index 
252793944462244539084a288e5259f216359650..9a2733927d38f1a2b1af92fcc12f036158b4d06f 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py @@ -16,6 +16,8 @@ import paddle.fluid as fluid import numpy as np import unittest import os +import sys +import math def simple_fc_net(): @@ -73,6 +75,14 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase): train_loss, = train_exe.run([loss.name], feed=feed_dict) + avg_test_loss_val = np.array(test_loss).mean() + if math.isnan(float(avg_test_loss_val)): + sys.exit("got NaN loss, testing failed.") + + avg_train_loss_val = np.array(train_loss).mean() + if math.isnan(float(avg_train_loss_val)): + sys.exit("got NaN loss, training failed.") + self.assertTrue( np.allclose( train_loss, test_loss, atol=1e-8),