diff --git a/CMakeLists.txt b/CMakeLists.txt index 9ec632e20690eafdc558e24f160270a89b29ee41..e4442d254901e2524385452ebe5ac6f6df3056f9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -212,7 +212,7 @@ endif() if (WITH_JEMALLOC) find_package(JeMalloc REQUIRED) include_directories(${JEMALLOC_INCLUDE_DIR}) - add_definitions(-DWITH_JEMALLOC) + add_definitions(-DPADDLE_WITH_JEMALLOC) endif() include(generic) # simplify cmake module @@ -276,9 +276,3 @@ add_subdirectory(paddle) if(WITH_PYTHON) add_subdirectory(python) endif() - -if(WITH_DOC) - find_package(Sphinx REQUIRED) - find_python_module(recommonmark REQUIRED) - add_subdirectory(doc) -endif() diff --git a/Dockerfile b/Dockerfile index acfd091265e26d6c29c561d166fed2504c0cff1c..fe0721e9b99b5e028df2f6228ff04cb56a567a3f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,12 +11,10 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub # ENV variables ARG WITH_GPU ARG WITH_AVX -ARG WITH_DOC ENV WOBOQ OFF ENV WITH_GPU=${WITH_GPU:-ON} ENV WITH_AVX=${WITH_AVX:-ON} -ENV WITH_DOC=${WITH_DOC:-OFF} ENV HOME /root # Add bash enhancements diff --git a/cmake/FindSphinx.cmake b/cmake/FindSphinx.cmake deleted file mode 100644 index f74cd4ff8c9c2c52319b18ac37264167b3718eae..0000000000000000000000000000000000000000 --- a/cmake/FindSphinx.cmake +++ /dev/null @@ -1,147 +0,0 @@ -# - This module looks for Sphinx -# Find the Sphinx documentation generator -# -# This modules defines -# SPHINX_EXECUTABLE -# SPHINX_FOUND - -find_program(SPHINX_EXECUTABLE - NAMES sphinx-build - PATHS - /usr/bin - /usr/local/bin - /opt/local/bin - DOC "Sphinx documentation generator" -) - -if( NOT SPHINX_EXECUTABLE ) - set(_Python_VERSIONS - 2.7 2.6 2.5 2.4 2.3 2.2 2.1 2.0 1.6 1.5 - ) - - foreach( _version ${_Python_VERSIONS} ) - set( _sphinx_NAMES sphinx-build-${_version} ) - - find_program( SPHINX_EXECUTABLE - NAMES ${_sphinx_NAMES} - PATHS - /usr/bin - /usr/local/bin - /opt/loca/bin - DOC "Sphinx documentation generator" - ) - endforeach() -endif() - -include(FindPackageHandleStandardArgs) - -find_package_handle_standard_args(Sphinx DEFAULT_MSG - SPHINX_EXECUTABLE -) - - -option( SPHINX_HTML_OUTPUT "Build a single HTML with the whole content." ON ) -option( SPHINX_DIRHTML_OUTPUT "Build HTML pages, but with a single directory per document." OFF ) -option( SPHINX_HTMLHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in htmlhelp." OFF ) -option( SPHINX_QTHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in qthelp." OFF ) -option( SPHINX_DEVHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in devhelp." OFF ) -option( SPHINX_EPUB_OUTPUT "Build HTML pages with additional information for building a documentation collection in epub." OFF ) -option( SPHINX_LATEX_OUTPUT "Build LaTeX sources that can be compiled to a PDF document using pdflatex." OFF ) -option( SPHINX_MAN_OUTPUT "Build manual pages in groff format for UNIX systems." OFF ) -option( SPHINX_TEXT_OUTPUT "Build plain text files." 
OFF ) - - -mark_as_advanced( - SPHINX_EXECUTABLE - SPHINX_HTML_OUTPUT - SPHINX_DIRHTML_OUTPUT - SPHINX_HTMLHELP_OUTPUT - SPHINX_QTHELP_OUTPUT - SPHINX_DEVHELP_OUTPUT - SPHINX_EPUB_OUTPUT - SPHINX_LATEX_OUTPUT - SPHINX_MAN_OUTPUT - SPHINX_TEXT_OUTPUT -) - -function( Sphinx_add_target target_name builder conf cache source destination ) - add_custom_target( ${target_name} ALL - COMMAND ${SPHINX_EXECUTABLE} -b ${builder} - -d ${cache} - -c ${conf} - ${source} - ${destination} - COMMENT "Generating sphinx documentation: ${builder}" - COMMAND cd ${destination} && ln -sf ./index_*.html index.html - ) - - set_property( - DIRECTORY APPEND PROPERTY - ADDITIONAL_MAKE_CLEAN_FILES - ${destination} - ) -endfunction() - -# Target dependencies can be optionally listed at the end. -function( Sphinx_add_targets target_base_name conf source base_destination ) - - set( _dependencies ) - - foreach( arg IN LISTS ARGN ) - set( _dependencies ${_dependencies} ${arg} ) - endforeach() - - if( ${SPHINX_HTML_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_html html ${conf} ${source} ${base_destination}/html ) - - add_dependencies( ${target_base_name}_html ${_dependencies} ) - endif() - - if( ${SPHINX_DIRHTML_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_dirhtml dirhtml ${conf} ${source} ${base_destination}/dirhtml ) - - add_dependencies( ${target_base_name}_dirhtml ${_dependencies} ) - endif() - - if( ${SPHINX_QTHELP_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_qthelp qthelp ${conf} ${source} ${base_destination}/qthelp ) - - add_dependencies( ${target_base_name}_qthelp ${_dependencies} ) - endif() - - if( ${SPHINX_DEVHELP_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_devhelp devhelp ${conf} ${source} ${base_destination}/devhelp ) - - add_dependencies( ${target_base_name}_devhelp ${_dependencies} ) - endif() - - if( ${SPHINX_EPUB_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_epub epub ${conf} ${source} ${base_destination}/epub ) - - add_dependencies( ${target_base_name}_epub ${_dependencies} ) - endif() - - if( ${SPHINX_LATEX_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_latex latex ${conf} ${source} ${base_destination}/latex ) - - add_dependencies( ${target_base_name}_latex ${_dependencies} ) - endif() - - if( ${SPHINX_MAN_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_man man ${conf} ${source} ${base_destination}/man ) - - add_dependencies( ${target_base_name}_man ${_dependencies} ) - endif() - - if( ${SPHINX_TEXT_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_text text ${conf} ${source} ${base_destination}/text ) - - add_dependencies( ${target_base_name}_text ${_dependencies} ) - endif() - - if( ${BUILD_TESTING} ) - sphinx_add_target( ${target_base_name}_linkcheck linkcheck ${conf} ${source} ${base_destination}/linkcheck ) - - add_dependencies( ${target_base_name}_linkcheck ${_dependencies} ) - endif() -endfunction() diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 252c32ba63bf93671837afe34e84cfc1624473a5..7b47da35f68ea7fa2ce4657282428a1e355509e9 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -1,4 +1,3 @@ - #windows treat symbolic file as a real file, which is different with unix #We create a hidden file and compile it instead of origin source file. 
function(windows_symbolic TARGET) @@ -207,3 +206,24 @@ endif (NOT WIN32) cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack) cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog) + +# Get the current working branch +execute_process( + COMMAND git rev-parse --abbrev-ref HEAD + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_BRANCH + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +# Get the latest abbreviated commit hash of the working branch +execute_process( + COMMAND git log -1 --format=%h + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_COMMIT + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +message(STATUS "commit: ${PADDLE_COMMIT}") +message(STATUS "branch: ${PADDLE_BRANCH}") + +configure_file(commit.h.in commit.h) diff --git a/paddle/fluid/framework/commit.h.in b/paddle/fluid/framework/commit.h.in new file mode 100644 index 0000000000000000000000000000000000000000..3a33ece624443a99083ae29abb70254a5ac40a3d --- /dev/null +++ b/paddle/fluid/framework/commit.h.in @@ -0,0 +1,21 @@ +#pragma once + +#include + +namespace paddle { +namespace framework { + +static std::string paddle_commit() { + return "@PADDLE_COMMIT@"; +} + +static std::string paddle_compile_branch() { + return "@PADDLE_BRANCH@"; +} + +static std::string paddle_version() { + return "@PADDLE_VERSION@"; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_traits.cc b/paddle/fluid/framework/ir/graph_traits.cc index 2ee12cc410393d1e1aa5fc9e5374d858eca1b901..929d9edc34ffb92f468d5b7af54a0b8da4121543 100644 --- a/paddle/fluid/framework/ir/graph_traits.cc +++ b/paddle/fluid/framework/ir/graph_traits.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/graph_traits.h" +#include #include namespace paddle { @@ -79,7 +80,7 @@ NodesTSIterator::NodesTSIterator(const std::vector &source) { } std::unordered_set visited; - std::unordered_set to_visit{source.begin(), source.end()}; + std::set to_visit{source.begin(), source.end()}; std::vector inlink_visited; while (!to_visit.empty()) { diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc deleted file mode 100644 index 7e174c7def1ffa4089a94d9cc504b18843557c53..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ngraph_operator.cc +++ /dev/null @@ -1,545 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include - -#include -#include - -#include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/ngraph_bridge.h" -#include "paddle/fluid/framework/ngraph_operator.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/var_desc.h" -#include "paddle/fluid/framework/var_type.h" - -#include "ngraph/ngraph.hpp" - -namespace paddle { -namespace framework { - -static ngraph::Shape Ddim2Shape(const DDim& dims) { - ngraph::Shape sp; - for (int i = 0; i < dims.size(); ++i) { - int k = dims[i]; - k = k == 0 ? 1 : k; - sp.push_back(k); - } - return sp; -} - -static std::map pd2ng_type_map = { - {proto::VarType::FP32, ngraph::element::f32}, - {proto::VarType::FP64, ngraph::element::f64}, - {proto::VarType::INT32, ngraph::element::i32}, - {proto::VarType::INT64, ngraph::element::i64}, - {proto::VarType::BOOL, ngraph::element::boolean}, -}; - -typedef enum { /* nGraph support state on ops */ - FULL_TRAIN, /* Support full ops for train */ - PARTIAL_TRAIN, /* Support partial ops for train */ - FULL_TEST, /* Support full list of ops for test */ - PARTIAL_TEST /* Support partial list of ops for test */ -} op_state; - -// perform graph build through bridge and execute computation -class NgraphEngine { - public: - explicit NgraphEngine(const Scope& scope, const platform::Place& place, - const std::vector>& ops, - const std::unordered_map< - std::string, ngraph::element::Type>& var_type_map, - const std::unordered_set& persist, - const std::unordered_set& fetches, - const std::unordered_set& post_op_inputs, - op_state ng_op_state) - : scope_(scope), - place_(place), - fused_ops_(ops), - var_type_map_(var_type_map), - persistables_(persist), - fetches_(fetches), - post_op_inputs_(post_op_inputs), - ng_op_state_(ng_op_state) { - var_in_node_map_ = std::make_shared< - std::unordered_map>>(); - - var_node_map_ = std::make_shared< - std::unordered_map>>(); - - BuildNgIO(); - - GetNgFunction(); - } - - void Run(const Scope& scope, const platform::Place& place) const; - - private: - static std::unordered_map> - func_cache_; - const Scope& scope_; - const platform::Place& place_; - std::vector> fused_ops_; - std::unordered_map var_type_map_; - std::unordered_set persistables_; - std::unordered_set fetches_; - std::unordered_set post_op_inputs_; - op_state ng_op_state_; - - // ngraph backend eg. 
CPU - static std::shared_ptr backend_; - // ngraph function to call and execute - std::shared_ptr ngraph_function_; - // var_name of inputs - std::vector var_in_; - // var_name of outputs from fetch in order - std::vector var_out_; - // map input vars to nodes - std::shared_ptr< - std::unordered_map>> - var_in_node_map_; - // map each var name with a ngraph node - std::shared_ptr< - std::unordered_map>> - var_node_map_; - // cache key to check if function is cached - std::shared_ptr GetCacheKey(); - // get ngraph input and define ngraph input parameters - void GetNgInputShape(std::shared_ptr op); - // Call ngraph bridge to map ops - void BuildNgNodes(); - // get the ngraph input and output var list - void BuildNgIO(); - // build ngraph function call - void BuildNgFunction(); - // Check cache for ngraph function or otherwise build the function - void GetNgFunction(); -}; - -std::vector>::iterator>> -NgraphOperator::NgraphOpIntervals( - std::vector>* ops) { - std::vector>::iterator>> - intervals; - if (ops->empty()) { - return intervals; - } - size_t size = ops->size(); - size_t left = 0; - while (left < size && ops->at(left)->Type() != kFeedOpType) { - ++left; - } - if (left == size) { - return intervals; - } - while (left < size && ops->at(left)->Type() == kFeedOpType) { - ++left; - } - - size_t right = left; - while (right < size && ops->at(right)->Type() != kFetchOpType) { - ++right; - } - if (right == size) { - return intervals; - } - if (left >= right) return intervals; - - // (left, right - 1) represents indices between feed and fetch - size_t pivot = left; - while (pivot < right) { - auto op_type = ops->at(pivot)->Type(); - if (paddle::framework::NgraphBridge::NG_NODE_MAP.find(op_type) == - paddle::framework::NgraphBridge::NG_NODE_MAP.end()) { - ++pivot; - } else { - size_t start = pivot, end = start; - while (pivot < right && - (paddle::framework::NgraphBridge::NG_NODE_MAP.find( - ops->at(pivot)->Type()) != - paddle::framework::NgraphBridge::NG_NODE_MAP.end())) { - ++pivot; - ++end; - } - std::vector>::iterator> - interval = {ops->begin() + start, ops->begin() + end}; - intervals.push_back(interval); - } - } // end while - - return intervals; -} - -NgraphOperator::NgraphOperator( - const ProgramDesc& prog, size_t block_id, - std::vector>::iterator start, - std::vector>::iterator end, - const std::string& type, const VariableNameMap& inputs, - const VariableNameMap& outputs, const AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs), - pdesc_(prog), - block_(block_id) { - for (std::vector>::iterator it = start; - it != end; ++it) { - fused_ops_.push_back(std::move(*it)); - } - - for (std::vector>::iterator it = end; - (*it)->Type() != kFetchOpType; ++it) { - for (auto& var_name_item : (*it)->Inputs()) { - for (auto& var_name : var_name_item.second) { - post_op_inputs_.insert(var_name); - } - } - } - - if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) { - is_full_ = true; - } - - Process(); -} - -void NgraphOperator::Process() { - auto& bdesc = pdesc_.Block(block_); - for (auto& var : bdesc.AllVars()) { - if (!(var->GetType() == proto::VarType::SELECTED_ROWS || - var->GetType() == proto::VarType::LOD_TENSOR || - var->GetType() == proto::VarType::LOD_TENSOR_ARRAY)) { - continue; - } - - auto var_name = var->Name(); - if (var->Name() == framework::kEmptyVarName) { - continue; - } - - if (var_name != "fetch" && var_name != "feed") { - auto pd_type = var->GetDataType(); - if (pd2ng_type_map.find(pd_type) == pd2ng_type_map.end()) { - 
PADDLE_THROW("Data type of var %s not found in pd2ng_type_map", - var_name); - } - var_type_map_[var_name] = pd2ng_type_map[pd_type]; - } - - if (var->Persistable()) { - persistables_.insert(var->Name()); - } - } - - for (auto* op : bdesc.AllOps()) { - if (op->Type() == kFetchOpType) { - std::string fetch_target_name = op->Input("X")[0]; - fetches_.insert(fetch_target_name); - } - } -} - -void NgraphOperator::RunImpl(const Scope& scope, - const platform::Place& place) const { - op_state ng_op_state = PARTIAL_TEST; - auto& bdesc = pdesc_.Block(block_); - for (auto* op : bdesc.AllOps()) { - if (op->Type().find("_grad") != std::string::npos) { - ng_op_state = PARTIAL_TRAIN; - break; - } - } - - if (is_full_) { - ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN; - } - - NgraphEngine ngraph_engine(scope, place, fused_ops_, var_type_map_, - persistables_, fetches_, post_op_inputs_, - ng_op_state); - ngraph_engine.Run(scope, place); -} - -std::unordered_map> - NgraphEngine::func_cache_ = {}; - -std::shared_ptr NgraphEngine::backend_ = - ngraph::runtime::Backend::create("CPU"); - -void NgraphEngine::GetNgInputShape(std::shared_ptr op) { - RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_); - op->RuntimeInferShape(scope_, place_, ctx); - for (auto& var_name_item : op->Inputs()) { - for (auto& var_name : var_name_item.second) { - auto* var = scope_.FindVar(var_name); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - auto sp = Ddim2Shape(tensor_pd->dims()); - if (std::find(var_in_.begin(), var_in_.end(), var_name) != - var_in_.end()) { - if (var_node_map_->find(var_name) == var_node_map_->end()) { - auto ng_type = var_type_map_.at(var_name); - auto prm = - std::make_shared(ng_type, sp, true); - (*var_node_map_)[var_name] = prm; - (*var_in_node_map_)[var_name] = prm; - } - } - } - } - } -} - -void NgraphEngine::BuildNgNodes() { - for (auto& var_name : var_out_) { - if (var_node_map_->find(var_name) == var_node_map_->end()) { - auto* var = scope_.FindVar(var_name); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - auto& ddim = tensor_pd->dims(); - auto ng_shape = Ddim2Shape(ddim); - auto ng_type = var_type_map_.at(var_name); - auto prm = - std::make_shared(ng_type, ng_shape, true); - (*var_node_map_)[var_name] = prm; - } - } - } - - paddle::framework::NgraphBridge ngb(var_node_map_); - for (auto& op : fused_ops_) { - ngb.BuildNgNode(op); - } -} - -void NgraphEngine::BuildNgIO() { - std::unordered_set inputs; - std::unordered_set outputs; - - for (auto& op : fused_ops_) { - for (auto& var_name_item : op->Inputs()) { - for (auto& var_name : var_name_item.second) { - inputs.insert(var_name); - const bool is_output = outputs.find(var_name) != outputs.end(); - if (!is_output && - std::find(var_in_.begin(), var_in_.end(), var_name) == - var_in_.end()) { - // fill var_in here to keep lhs and rhs order - var_in_.push_back(var_name); - } - } - } - - if (op->Type() != "fill_constant") { - GetNgInputShape(op); - } - - for (auto& var_name_item : op->Outputs()) { - PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, - "op %s has more than 1 output - Not handling yet", - op->Type()); - for (auto& var_name : var_name_item.second) { - outputs.insert(var_name); - } - } - } - - // var_out.clear(); - for (auto& op : fused_ops_) { - for (auto& var_name_item : op->Outputs()) { - PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, - "op %s has more than 1 output - Not handling yet", - op->Type()); - for (auto& 
var_name : var_name_item.second) { - switch (ng_op_state_) { - case PARTIAL_TEST: - if (post_op_inputs_.find(var_name) != post_op_inputs_.end() || - fetches_.find(var_name) != fetches_.end()) { - var_out_.push_back(var_name); - } - break; - case FULL_TEST: - if (fetches_.find(var_name) != fetches_.end()) { - var_out_.push_back(var_name); - } - break; - case PARTIAL_TRAIN: - if (fetches_.find(var_name) != fetches_.end() || - post_op_inputs_.find(var_name) != post_op_inputs_.end() || - persistables_.find(var_name) != persistables_.end()) { - var_out_.push_back(var_name); - } - break; - case FULL_TRAIN: - if (fetches_.find(var_name) != fetches_.end() || - persistables_.find(var_name) != persistables_.end()) { - var_out_.push_back(var_name); - } - break; - default: - var_out_.push_back(var_name); - } - } - } - } -} - -void NgraphEngine::BuildNgFunction() { - BuildNgNodes(); - ngraph_function_ = nullptr; - ngraph::NodeVector func_outputs; - ngraph::ParameterVector func_inputs; - - for (auto& vo : var_out_) { - func_outputs.push_back(var_node_map_->at(vo)); - } - - for (auto& vi : var_in_) { - std::shared_ptr prm = - std::dynamic_pointer_cast( - var_in_node_map_->at(vi)); - func_inputs.push_back(prm); - } - - ngraph_function_ = - std::make_shared(func_outputs, func_inputs); -} - -std::shared_ptr NgraphEngine::GetCacheKey() { - auto cache_key = std::make_shared(""); - *cache_key += std::to_string(fused_ops_.size()); - for (auto& op : fused_ops_) { - *cache_key += op->Type(); - } - for (auto& var_name : var_in_) { - auto shape = var_node_map_->at(var_name)->get_shape(); - *cache_key += var_name; - *cache_key += var_type_map_.at(var_name).c_type_string(); - for (size_t i = 0; i < shape.size(); ++i) { - *cache_key += std::to_string(shape.at(i)); - } - } - - for (auto& var_name : var_out_) { - auto* var = scope_.FindVar(var_name); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - auto& ddim = tensor_pd->dims(); - for (int i = 0; i < ddim.size(); ++i) { - *cache_key += std::to_string(ddim[i]); - } - } - } - return cache_key; -} - -void NgraphEngine::GetNgFunction() { - bool cache_on = true; - if (cache_on) { - std::string cache_key_val = *GetCacheKey(); - if (func_cache_.find(cache_key_val) != func_cache_.end()) { - ngraph_function_ = func_cache_.at(cache_key_val); - } else { - BuildNgFunction(); - func_cache_[cache_key_val] = ngraph_function_; - } - } else { - BuildNgFunction(); - } -} - -void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const { - std::vector> t_in; - std::vector> t_out; - - for (size_t i = 0; i < var_in_.size(); ++i) { - auto vi = var_in_.at(i); - auto sp = var_node_map_->at(vi)->get_shape(); - std::shared_ptr ti; - auto* var = scope.FindVar(vi); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()), - "Ensure ngraph tensor layout align with paddle tensor"); - if (tensor_pd->type() == proto::VarType::FP32) { - const float* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::f32, sp, - const_cast(arr)); - } else if (tensor_pd->type() == proto::VarType::INT32) { - const int* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::i32, sp, - const_cast(arr)); - } else if (tensor_pd->type() == proto::VarType::INT64) { - const int64_t* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::i64, sp, - const_cast(arr)); - } else if (tensor_pd->type() == 
proto::VarType::FP64) { - const double* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::f64, sp, - const_cast(arr)); - } else if (tensor_pd->type() == proto::VarType::BOOL) { - const bool* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::boolean, sp, - const_cast(arr)); - } else { - PADDLE_THROW("Data type not handling for var %s", vi); - } - } else { - PADDLE_THROW("Cannot find var or tensor with var name %s", vi); - } - bool is_test = (ng_op_state_ == PARTIAL_TEST || ng_op_state_ == FULL_TEST) - ? true - : false; - bool is_persistable = - (persistables_.find(vi) != persistables_.end()) ? true : false; - if (is_test && is_persistable) { - ti->set_stale(false); - } - t_in.push_back(ti); - } - - for (size_t i = 0; i < var_out_.size(); ++i) { - auto var_name = var_out_[i]; - auto* var = scope.FindVar(var_name); - std::shared_ptr to; - if (var && var->IsType()) { - auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); - auto dd = tensor_pd->dims(); - ngraph::Shape sp = Ddim2Shape(dd); - auto ng_type = var_type_map_.at(var_name); - if (ng_type == ngraph::element::f32) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ngraph::element::f32, sp, pd_arr); - } else if (ng_type == ngraph::element::i64) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ngraph::element::i64, sp, pd_arr); - } else if (ng_type == ngraph::element::f64) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ngraph::element::f64, sp, pd_arr); - } else if (ng_type == ngraph::element::boolean) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr); - } else { - PADDLE_THROW("Data type not handled in for var %s", var_name); - } - t_out.push_back(to); - } else { - PADDLE_THROW("Cannot find var or tensor with var name %s", var_name); - } - } - - backend_->call(backend_->compile(ngraph_function_), t_out, t_in); -} // NgraphEngine::RunImpl -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index ee9f6a480542845beffdb26767ce1b1578118725..9d6c10ab9e33d0e9888fa484030be9da7752512e 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -555,18 +555,17 @@ Tensor* ExecutionContext::LegacyOutput(const std::string& name) const { template <> std::vector ExecutionContext::MultiOutput( const std::string& name) const { - auto names = op().Outputs(name); + auto it = ctx_.outputs.find(name); + if (it == ctx_.outputs.end()) { + return {}; + } + const std::vector& vars = it->second; std::vector res; - res.reserve(names.size()); - std::transform(names.begin(), names.end(), std::back_inserter(res), - [&](const std::string& sub_name) -> Tensor* { - auto var = scope_.FindVar(sub_name); - if (var == nullptr) return nullptr; - PADDLE_ENFORCE( - var->IsType(), - "%s should be LoDTensor, but the received type is %s", - sub_name, ToTypeName(var->Type())); - return var->GetMutable(); + res.reserve(vars.size()); + std::transform(vars.begin(), vars.end(), std::back_inserter(res), + [&](Variable* var) -> Tensor* { + return var == nullptr ? 
nullptr + : var->GetMutable(); }); return res; } @@ -1073,7 +1072,9 @@ Scope* OperatorWithKernel::PrepareData( proto::VarType::Type OperatorWithKernel::IndicateDataType( const ExecutionContext& ctx) const { - int data_type = -1; + proto::VarType::Type dafault_data_type = + static_cast(-1); + proto::VarType::Type data_type = dafault_data_type; for (auto& input : this->inputs_) { const std::vector vars = ctx.MultiInputVar(input.first); for (size_t i = 0; i < vars.size(); ++i) { @@ -1090,18 +1091,19 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( if (t != nullptr) { PADDLE_ENFORCE(t->IsInitialized(), "Input %s(%lu)is not initialized", input.first, i); - int tmp = static_cast(t->type()); + proto::VarType::Type tmp = t->type(); PADDLE_ENFORCE( - tmp == data_type || data_type == -1, + tmp == data_type || data_type == dafault_data_type, "DataType of Paddle Op %s must be the same. Get (%d) != (%d)", - Type(), data_type, tmp); + Type(), DataTypeToString(data_type), DataTypeToString(tmp)); data_type = tmp; } } } } - PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input"); - return static_cast(data_type); + PADDLE_ENFORCE(data_type != dafault_data_type, + "DataType should be indicated by input"); + return data_type; } OpKernelType OperatorWithKernel::GetExpectedKernelType( diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index ce3ad18b1fb1c6304eaa60173e6dfad5e9dafb2d..ef5404e4755817cefc925acbf4882ff86d1f0ba3 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -25,7 +25,8 @@ inline const T* Tensor::data() const { check_memory_size(); bool valid = std::is_same::value || type_ == DataTypeTrait::DataType; - PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d", type_); + PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d", + DataTypeToString(type_)); return reinterpret_cast( reinterpret_cast(holder_->ptr()) + offset_); diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 8029129b9a6a9fcbc0ff10daa1f25b210259e9d8..83fc6ee2e299f5fa18d5cc6f220c0be6a66e709d 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -204,59 +204,68 @@ framework::LoDTensor& VarBase::GradValue() { } std::map> OpBase::ApplyGrad() { - if (!grad_op_desc_ && backward_id_ <= 0) { + if (grad_op_descs_.empty() && backward_id_ <= 0) { LOG(WARNING) << "op with no grad: " << op_desc_->Type(); return {}; } - std::map> grad_outputs; + std::vector grad_outputs; if (backward_id_ > 0) { VLOG(3) << "py_layer_grad"; - grad_outputs[framework::GradVarName(PyLayer::kFwdOut)] = PyLayer::ApplyGrad( - backward_id_, - grad_input_vars_[framework::GradVarName(PyLayer::kFwdInp)]); + grad_outputs.resize(1); + grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] = + PyLayer::ApplyGrad( + backward_id_, + grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)]); } else { - VLOG(3) << "op grad " << grad_op_desc_->Type(); - for (auto it : grad_output_vars_) { - auto& outputs = grad_outputs[it.first]; - for (size_t i = 0; i < it.second.size(); ++i) { - // Allocate a new variable - Variable* tmp_var = new framework::Variable(); - tmp_var->GetMutable(); - outputs.push_back(tmp_var); + grad_outputs.resize(grad_op_descs_.size()); + for (size_t k = 0; k < grad_op_descs_.size(); ++k) { + framework::OpDesc* grad_op_desc = grad_op_descs_[k]; + VLOG(3) << "op grad " << grad_op_desc->Type(); + for (auto it : grad_output_vars_[k]) { + auto& outputs = 
grad_outputs[k][it.first]; + for (size_t i = 0; i < it.second.size(); ++i) { + // Allocate a new variable + Variable* tmp_var = new framework::Variable(); + tmp_var->GetMutable(); + outputs.push_back(tmp_var); + } } - } - framework::RuntimeContext ctx(grad_input_vars_, grad_outputs); + framework::RuntimeContext ctx(grad_input_vars_[k], grad_outputs[k]); - // No need to do compile time infer shape here. - // grad_op_desc_->InferShape(*block_); - grad_op_desc_->InferVarType(block_); + // No need to do compile time infer shape here. + // grad_op_desc_->InferShape(*block_); + grad_op_desc->InferVarType(block_); - std::unique_ptr opbase = - framework::OpRegistry::CreateOp(*grad_op_desc_); - framework::OperatorWithKernel* op_kernel = - dynamic_cast(opbase.get()); - PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); + std::unique_ptr opbase = + framework::OpRegistry::CreateOp(*grad_op_desc); + framework::OperatorWithKernel* op_kernel = + dynamic_cast(opbase.get()); + PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); - framework::Scope scope; - PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_); - p.op.RuntimeInferShape(scope, place_, ctx); - p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); + framework::Scope scope; + PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_); + p.op.RuntimeInferShape(scope, place_, ctx); + p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); + } } - for (auto it : grad_output_vars_) { - auto& outputs = grad_outputs[it.first]; - auto& origin_outputs = it.second; - PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size()); - - for (size_t i = 0; i < outputs.size(); ++i) { - framework::Variable* grad = outputs[i]; - framework::Variable* orig_grad = origin_outputs[i]; - AddTo(grad, orig_grad, place_); - delete grad; + for (size_t k = 0; k < grad_output_vars_.size(); ++k) { + for (auto it : grad_output_vars_[k]) { + auto& outputs = grad_outputs[k][it.first]; + auto& origin_outputs = it.second; + PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size()); + + for (size_t i = 0; i < outputs.size(); ++i) { + framework::Variable* grad = outputs[i]; + framework::Variable* orig_grad = origin_outputs[i]; + AddTo(grad, orig_grad, place_); + delete grad; + } } } + return input_vars_; } diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 633924aa417b8bd64bf4921054f82fdb7f7868fe..dc97433a5102b39d03ea5cac3157c027f9d67c98 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -184,12 +184,13 @@ class OpBase { OpBase() : op_desc_(nullptr), forward_id_(-1), - grad_op_desc_(nullptr), backward_id_(-1), place_(platform::CPUPlace()) {} virtual ~OpBase() { - if (grad_op_desc_) delete grad_op_desc_; + for (framework::OpDesc* desc : grad_op_descs_) { + delete desc; + } } std::map> ApplyGrad(); @@ -198,9 +199,11 @@ class OpBase { // For pure python PyLayer, use `forward_id_`, otherwise, use op_desc_. framework::OpDesc* op_desc_; int forward_id_; - // When has backward, one of `grad_op_desc_` or `backward_id_` is set, + + // When has backward, one of `grad_op_descs_` or `backward_id_` is set, // not both. - framework::OpDesc* grad_op_desc_; + // Note: each fwd op corresponds to a vector of bwd ops. 
+ std::vector grad_op_descs_; int backward_id_; platform::Place place_; @@ -210,8 +213,11 @@ class OpBase { OpBasePtrMap pre_ops_; std::map> pre_ops_out_idx_; - framework::VariableValueMap grad_input_vars_; - framework::VariableValueMap grad_output_vars_; + // Inputs to a vector of bwd ops. + std::vector grad_input_vars_; + // Outputs to a vector of bwd ops. + std::vector grad_output_vars_; + framework::BlockDesc* block_; }; diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 5b87839f457c24d5d6687a27faac6c0f52f5f90b..cd62807a5532e6b2309cb5a8f679c3097b51c9e9 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -24,15 +24,16 @@ namespace imperative { void CreateGradOp(const framework::OpDesc& op_desc, const std::unordered_set& no_grad_set, const std::vector& grad_sub_block, - framework::OpDesc** grad_op_desc, + std::vector* grad_op_descs, std::unordered_map* grad_to_var) { - std::vector> grad_op_descs = + PADDLE_ENFORCE(grad_op_descs->empty()); + std::vector> descs = framework::OpInfoMap::Instance() .Get(op_desc.Type()) .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block); - PADDLE_ENFORCE(grad_op_descs.size() == 1, "Only support 1 grad op now."); - // TODO(panyx0718): Leak? - *grad_op_desc = grad_op_descs[0].release(); + for (auto& desc : descs) { + grad_op_descs->emplace_back(desc.release()); + } } void InitVar(framework::Variable* var, framework::Variable* grad_var, @@ -138,49 +139,52 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, prepared_op.op, scope, *prepared_op.dev_ctx, prepared_op.ctx)); if (!stop_gradient) { - framework::OpDesc* grad_op_desc; - // TODO(panyx): Is this leaked? std::unique_ptr> grad_to_var( new std::unordered_map()); - CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var.get()); - op->grad_op_desc_ = grad_op_desc; - - for (auto it : grad_op_desc->Inputs()) { - auto& grad_in_vars = op->grad_input_vars_[it.first]; - for (const std::string& grad_invar : it.second) { - block->FindRecursiveOrCreateVar(grad_invar); - auto var_it = grad_to_var->find(grad_invar); - if (var_it == grad_to_var->end()) { - auto fwd_var_it = vars.find(grad_invar); - PADDLE_ENFORCE(fwd_var_it != vars.end()); - // Forward inputs or outputs. - grad_in_vars.push_back(fwd_var_it->second->var_); - } else { + CreateGradOp(*op_desc, {}, {block}, &op->grad_op_descs_, grad_to_var.get()); + + op->grad_input_vars_.resize(op->grad_op_descs_.size()); + op->grad_output_vars_.resize(op->grad_op_descs_.size()); + for (size_t i = 0; i < op->grad_op_descs_.size(); ++i) { + framework::OpDesc* grad_op_desc = op->grad_op_descs_[i]; + for (auto it : grad_op_desc->Inputs()) { + auto& grad_in_vars = op->grad_input_vars_[i][it.first]; + for (const std::string& grad_invar : it.second) { + block->FindRecursiveOrCreateVar(grad_invar); + auto var_it = grad_to_var->find(grad_invar); + if (var_it == grad_to_var->end()) { + auto fwd_var_it = vars.find(grad_invar); + PADDLE_ENFORCE(fwd_var_it != vars.end()); + // Forward inputs or outputs. + grad_in_vars.push_back(fwd_var_it->second->var_); + } else { + VarBase* var = vars[var_it->second]; + if (!var->grads_->var_->IsInitialized()) { + InitVar(var->var_, var->grads_->var_, + prepared_op.GetDeviceContext()); + } + // Douts. 
+ grad_in_vars.push_back(var->grads_->var_); + } + } + } + + for (auto it : grad_op_desc->Outputs()) { + auto& grad_out_vars = op->grad_output_vars_[i][it.first]; + for (const std::string& grad_outvar : it.second) { + block->FindRecursiveOrCreateVar(grad_outvar); + auto var_it = grad_to_var->find(grad_outvar); + PADDLE_ENFORCE(var_it != grad_to_var->end(), + "Could not found the grad op output var, should this " + "operator %s's stop gradient be True", + op_desc->Type()); VarBase* var = vars[var_it->second]; if (!var->grads_->var_->IsInitialized()) { InitVar(var->var_, var->grads_->var_, prepared_op.GetDeviceContext()); } - // Douts. - grad_in_vars.push_back(var->grads_->var_); - } - } - } - - for (auto it : grad_op_desc->Outputs()) { - auto& grad_out_vars = op->grad_output_vars_[it.first]; - for (const std::string& grad_outvar : it.second) { - block->FindRecursiveOrCreateVar(grad_outvar); - auto var_it = grad_to_var->find(grad_outvar); - PADDLE_ENFORCE(var_it != grad_to_var->end(), - "Could not found the grad op output var, should this " - "operator %s's stop gradient be True", - op_desc->Type()); - VarBase* var = vars[var_it->second]; - if (!var->grads_->var_->IsInitialized()) { - InitVar(var->var_, var->grads_->var_, prepared_op.GetDeviceContext()); + grad_out_vars.push_back(var->grads_->var_); } - grad_out_vars.push_back(var->grads_->var_); } } } @@ -209,10 +213,12 @@ std::vector Tracer::PyTrace(OpBase* op, out->TrackPreOp(op, PyLayer::kFwdOut, i, stop_gradient); } if (!stop_gradient) { + op->grad_input_vars_.resize(1); + op->grad_output_vars_.resize(1); auto& grad_input_vars = - op->grad_input_vars_[framework::GradVarName(PyLayer::kFwdInp)]; + op->grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)]; auto& grad_output_vars = - op->grad_output_vars_[framework::GradVarName(PyLayer::kFwdOut)]; + op->grad_output_vars_[0][framework::GradVarName(PyLayer::kFwdOut)]; for (const VarBase* inp : inputs) { grad_input_vars.push_back(inp->var_); diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 71c4a54dea08d9d5e53f182949854981fe36a41a..a2546ead93c3baeb8029f6451d8a60dcc75f8571 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -28,6 +28,7 @@ #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/platform/variant.h" namespace paddle { @@ -130,6 +131,8 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int); DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); + DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode, + contrib::AnalysisConfig::Precision); // Memory optimized related. 
DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); diff --git a/paddle/fluid/inference/analysis/helper.cc b/paddle/fluid/inference/analysis/helper.cc index ca40c01fc57dbcc2ca16770a1b7d798de8b5625b..4f5c50d0d6b9ac94130cb82fb342ae5ee592f2c0 100644 --- a/paddle/fluid/inference/analysis/helper.cc +++ b/paddle/fluid/inference/analysis/helper.cc @@ -36,6 +36,14 @@ void SetAttr(framework::proto::OpDesc *op, const std::string &name, attr->set_i(data); } template <> +void SetAttr(framework::proto::OpDesc *op, const std::string &name, + const bool &data) { + auto *attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::BOOLEAN); + attr->set_b(data); +} +template <> void SetAttr(framework::proto::OpDesc *op, const std::string &name, const int64_t &data) { auto *attr = op->add_attrs(); diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index de04713b531dc421b885473cc8956e8ba6b63574..120f6ef27d49ae59ec36304dc3742cd9ca0afa4b 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include #include #include #include @@ -29,9 +30,14 @@ limitations under the License. */ #include "paddle/fluid/platform/port.h" #ifdef _WIN32 +#include +#include #define GCC_ATTRIBUTE(attr__) ; +#define MKDIR(path) _mkdir(path) #else +#include #define GCC_ATTRIBUTE(attr__) __attribute__((attr__)); +#define MKDIR(path) mkdir(path, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH) #endif #define __SHOULD_USE_RESULT__ GCC_ATTRIBUTE(warn_unused_result) @@ -163,6 +169,54 @@ static bool PathExists(const std::string &path) { return false; } +static std::string GetDirRoot(const std::string &path) { + char sep = '/'; + +#ifdef _WIN32 + sep = '\\'; +#endif + + size_t i = path.rfind(sep, path.length()); + if (i != std::string::npos) { + return (path.substr(0, i)); + } + return path; +} + +static std::string GetOrCreateModelOptCacheDir(const std::string &model_root) { + std::string opt_cache_dir = model_root + "/_opt_cache/"; + if (!PathExists(opt_cache_dir)) { + PADDLE_ENFORCE(MKDIR(opt_cache_dir.c_str()) != -1, + "Can not create optimize cache directory: %s, Make sure you " + "have permission to write", + opt_cache_dir); + } + return opt_cache_dir; +} + +static std::string GetTrtCalibPath(const std::string &model_root, + const std::string &engine_key) { + return model_root + "/trt_calib_" + engine_key; +} + +// If there is no calib table data file in model_opt_cache_dir, return "". 
+static std::string GetTrtCalibTableData(const std::string &model_opt_cache_dir, + const std::string &engine_key, + bool enable_int8) { + std::string trt_calib_table_path = + GetTrtCalibPath(model_opt_cache_dir, engine_key); + if (enable_int8 && FileExists(trt_calib_table_path)) { + VLOG(3) << "Calibration table file: " << trt_calib_table_path + << "is found here"; + std::ifstream infile(trt_calib_table_path, std::ios::in); + std::stringstream buffer; + buffer << infile.rdbuf(); + std::string calibration_data(buffer.str()); + return calibration_data; + } + return ""; +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 4e1464226450b833e6d8dae2be2dcad89dd1e5e4..99611ce84b23896dd173831a03d77c6e0252d998 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -67,6 +67,20 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); pass->Set("min_subgraph_size", new int(argument->tensorrt_min_subgraph_size())); + pass->Set("program", + new framework::ProgramDesc *(&argument->main_program())); + + bool enable_int8 = argument->tensorrt_precision_mode() == + contrib::AnalysisConfig::Precision::kInt8; + + pass->Set("enable_int8", new bool(enable_int8)); + std::string model_opt_cache_dir = + argument->Has("model_dir") + ? argument->model_dir() + : GetDirRoot(argument->model_program_path()); + pass->Set( + "model_opt_cache_dir", + new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir))); } // graph_ = pass->Apply(std::move(graph_)); @@ -91,11 +105,14 @@ std::unique_ptr IRPassManager::Apply(std::unique_ptr graph) { } framework::proto::ProgramDesc IRPassManager::AcquireProgram( - std::unique_ptr *graph, const ProgramDesc &program) const { + std::unique_ptr *graph, ProgramDesc *program) const { auto pass = framework::ir::PassRegistry::Instance().Get("graph_to_program_pass"); - ProgramDesc desc(program); + // Direct using ProgramDesc desc(argument->main_program()) may cause + // incomplete copies of information. 
+ ProgramDesc desc; + desc.CopyFrom(*program->Proto()); pass->SetNotOwned("program", &desc); auto *the_graph = graph->release(); *graph = pass->Apply(std::unique_ptr(the_graph)); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.h b/paddle/fluid/inference/analysis/ir_pass_manager.h index 983a582649706fa6eedb5aa459b5ac53b98f658b..2a595cb36b8345157b3fd26afc62aabfa98b87bc 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.h +++ b/paddle/fluid/inference/analysis/ir_pass_manager.h @@ -29,6 +29,7 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/analysis/argument.h" +#include "paddle/fluid/inference/analysis/helper.h" namespace paddle { namespace inference { @@ -42,8 +43,8 @@ class IRPassManager final { std::unique_ptr Apply(std::unique_ptr graph); - framework::proto::ProgramDesc AcquireProgram( - std::unique_ptr *graph, const ProgramDesc &program) const; + framework::proto::ProgramDesc AcquireProgram(std::unique_ptr *graph, + ProgramDesc *program) const; framework::ir::Graph &graph() const { return *graph_; } diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 5f25303cc1eaa6b563f0f8f4289b38499eb487cc..69a9caec030600332c9f11ba255e4e642bd41e96 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include @@ -67,12 +68,33 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( return graph; } +std::string GenerateEngineKey(const std::set &engine_inputs, + const std::set &engine_outputs) { + std::string engine_hash_key = ""; + for (auto name : engine_inputs) { + engine_hash_key += name; + } + for (auto name : engine_outputs) { + engine_hash_key += name; + } + auto engine_key = std::to_string(std::hash()(engine_hash_key)); + return engine_key; +} + void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, Graph *graph) const { auto *op_desc = node->Op(); auto &subgraph = *Agent(node).subgraph(); PADDLE_ENFORCE(!subgraph.empty()); + framework::ProgramDesc *program_desc = + Get("program"); + // Add new block for TensorRTEngineOP + const framework::BlockDesc &main_block = + program_desc->Block(framework::kRootBlockIndex); + // const framework::BlockDesc& main_block = program_desc->Block(0); + framework::BlockDesc *new_block = program_desc->AppendBlock(main_block); + // An fake block desc. framework::proto::BlockDesc block_proto; framework::BlockDesc block_desc(nullptr, &block_proto); @@ -82,13 +104,18 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, subgraph.size()); for (auto *node : subgraph) { + auto *new_block_op = new_block->AppendOp(); auto *op = block_desc.AppendOp(); + *new_block_op->Proto() = *node->Op()->Proto(); *op->Proto() = *node->Op()->Proto(); } - // collect inputs - std::unordered_set input_names; - std::unordered_set input_names_with_id; + // Then, we will use the input_names_with_id and output_names_with_id to + // generate the eigine key. + // So, We use set instead of unordered_set here to ensure that the engine key + // is unique. 
+ std::set input_names; + std::set input_names_with_id; for (auto *x : node->inputs) { input_names.insert(x->Name()); input_names_with_id.insert(x->Name() + std::to_string(x->id())); @@ -96,8 +123,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, op_desc->SetInput( "Xs", std::vector(input_names.begin(), input_names.end())); - std::unordered_set output_names; - std::unordered_set output_names_with_id; + std::set output_names; + std::set output_names_with_id; for (auto *x : node->outputs) { output_names.insert(x->Name()); output_names_with_id.insert(x->Name() + std::to_string(x->id())); @@ -182,7 +209,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, // to Tensor. std::vector output_mapping; for (auto name : output_names) { - // LOG(INFO) << name << " " << output_name_map.size(); PADDLE_ENFORCE(output_name_map.count(name) != 0); output_mapping.push_back(output_name_map[name]); } @@ -193,16 +219,29 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, *vars->Add() = *node->Var()->Proto(); } } + PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), "the block has no var-desc"); PADDLE_ENFORCE(!output_mapping.empty()); - // Set attrs + op_desc->SetBlockAttr("sub_block", new_block); SetAttr(op_desc->Proto(), "subgraph", block_desc.Proto()->SerializeAsString()); + // Set attrs SetAttr(op_desc->Proto(), "max_batch_size", Get("max_batch_size")); SetAttr(op_desc->Proto(), "workspace_size", Get("workspace_size")); SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes())); SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); + + auto enable_int8 = Get("enable_int8"); + auto engine_key = + GenerateEngineKey(input_names_with_id, output_names_with_id); + + std::string calibration_data = GetTrtCalibTableData( + Get("model_opt_cache_dir"), engine_key, enable_int8); + SetAttr(op_desc->Proto(), "calibration_data", calibration_data); + + SetAttr(op_desc->Proto(), "enable_int8", enable_int8); + SetAttr(op_desc->Proto(), "engine_key", engine_key); } std::vector ExtractParameters( diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc index f1da37af3cc5fa55eb66a1822aefe96eda1dc4fb..6b3d80fcef0be1527062edbb37ea39cc5d95a168 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc @@ -31,7 +31,11 @@ void IrGraphToProgramPass::RunImpl(Argument *argument) { } std::unique_ptr graph(argument->main_graph_ptr()); - framework::ProgramDesc desc(argument->main_program()); + + // Direct using ProgramDesc desc(argument->main_program()) may cause + // incomplete copies of information. + framework::ProgramDesc desc; + desc.CopyFrom(*argument->main_program().Proto()); pass->SetNotOwned("program", &desc); auto thegraph = pass->Apply(std::move(graph)); thegraph.release(); // the argument still own the graph. 
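(Context for the TensorRT changes above: the subgraph pass builds a deterministic engine key from the ordered input/output variable names, and the helpers added to analysis/helper.h resolve that key to an INT8 calibration table stored under model_root + "/_opt_cache/". The following is a minimal sketch of that lookup — illustrative only, not part of the patch — assuming the helpers GetOrCreateModelOptCacheDir and GetTrtCalibTableData introduced in this diff are in scope:)

// Sketch: map an engine key (as produced by GenerateEngineKey in
// tensorrt_subgraph_pass.cc) to a cached calibration table on disk.
#include <functional>
#include <set>
#include <string>

std::string LookUpCalibTable(const std::string &model_root,
                             const std::set<std::string> &engine_inputs,
                             const std::set<std::string> &engine_outputs,
                             bool enable_int8) {
  // std::set keeps the variable names ordered, so concatenating them yields
  // the same hash (and hence the same engine key) on every run.
  std::string key_src;
  for (const auto &name : engine_inputs) key_src += name;
  for (const auto &name : engine_outputs) key_src += name;
  std::string engine_key = std::to_string(std::hash<std::string>()(key_src));

  // The table is expected at <model_root>/_opt_cache/trt_calib_<engine_key>;
  // GetTrtCalibTableData returns "" when no calibration data exists yet.
  std::string cache_dir = GetOrCreateModelOptCacheDir(model_root);
  return GetTrtCalibTableData(cache_dir, engine_key, enable_int8);
}

(SaveTrtCalibToDisk, added further below in analysis_predictor.cc, writes the table to the same path using the engine_key attribute stored on the tensorrt_engine op, so a table produced during calibration is found by this lookup on the next load.)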
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index e6008ba335ed89222247fc00033d1afbd6b28f16..8efd514bd8397f099fd07321ad7e5d4ca253e229 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -102,6 +102,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { CP_MEMBER(tensorrt_workspace_size_); CP_MEMBER(tensorrt_max_batchsize_); CP_MEMBER(tensorrt_min_subgraph_size_); + CP_MEMBER(tensorrt_precision_mode_); // MKLDNN releated. CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); @@ -141,9 +142,9 @@ void contrib::AnalysisConfig::EnableMKLDNN() { Update(); } -void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, - int max_batch_size, - int min_subgraph_size) { +void contrib::AnalysisConfig::EnableTensorRtEngine( + int workspace_size, int max_batch_size, int min_subgraph_size, + contrib::AnalysisConfig::Precision precision_mode) { #ifdef PADDLE_WITH_CUDA if (!use_gpu()) { LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first"; @@ -154,6 +155,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, tensorrt_workspace_size_ = workspace_size; tensorrt_max_batchsize_ = max_batch_size; tensorrt_min_subgraph_size_ = min_subgraph_size; + tensorrt_precision_mode_ = precision_mode; Update(); #else diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 9f8a78f7abc37d17b9806ea766da132f9bf4b28d..66374cb7f07b3d9b6bfbff8382a3dfa7e8f2b04f 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/inference/api/analysis_predictor.h" #include #include +#include #include #include #include @@ -25,6 +26,7 @@ #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" @@ -37,6 +39,8 @@ #if PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" + #endif DECLARE_bool(profile); @@ -44,6 +48,12 @@ DECLARE_bool(profile); namespace paddle { using contrib::AnalysisConfig; +using inference::Singleton; +#if PADDLE_WITH_TENSORRT +using inference::tensorrt::TRTInt8Calibrator; +using inference::tensorrt::TRTCalibratorEngine; +using inference::tensorrt::TRTCalibratorEngineManager; +#endif namespace { bool IsPersistable(const framework::VarDesc *var) { @@ -113,6 +123,15 @@ bool AnalysisPredictor::PrepareProgram( if (!program) { if (!LoadProgramDesc()) return false; + // If not cloned, the parameters should be loaded. + // If config_.ir_optim() is True, parameters is loaded in + // OptimizeInferenceProgram(), but other persistable variables + // (like RAW type var) are not created in scope. + // If config_.ir_optim() is False, parameters is loaded in LoadParameters(), + // still need to create other persistable variables. + // So in both case, create persistable variables at first. + executor_->CreateVariables(*inference_program_, 0, true, sub_scope_); + // Optimize the program, and load parameters and modify them in the // scope_. 
// This will change the scope_ address. @@ -120,15 +139,6 @@ bool AnalysisPredictor::PrepareProgram( status_ir_optim_enabled_ = true; OptimizeInferenceProgram(); } else { - // If the parent_scope is passed, we assert that the persistable variables - // are already created, so just create the no persistable variables. - - // If not cloned, the parameters should be loaded - // OptimizeInferenceProgram. - // So in both cases, just the local variables are needed to load, not the - // parematers. - executor_->CreateVariables(*inference_program_, 0, true, sub_scope_); - // Load parameters LOG(INFO) << "load parameters "; LoadParameters(); @@ -339,6 +349,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() { !config_.params_file().empty(), "Either model_dir or (param_file, prog_file) should be set."); PADDLE_ENFORCE(!config_.prog_file().empty()); + std::string dir = inference::analysis::GetDirRoot(config_.prog_file()); + argument_.SetModelProgramPath(config_.prog_file()); argument_.SetModelParamsPath(config_.params_file()); } @@ -349,6 +361,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_); argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_); + argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_); } if (config_.use_mkldnn_) { @@ -363,7 +376,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { } argument_.SetIrAnalysisPasses(passes); argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses()); - argument_.SetScopeNotOwned(const_cast(scope_.get())); + argument_.SetScopeNotOwned(scope_.get()); Analyzer().Run(&argument_); PADDLE_ENFORCE(argument_.scope_valid()); @@ -569,7 +582,67 @@ bool AnalysisPredictor::LoadParameters() { return true; } +#if PADDLE_WITH_TENSORRT +bool AnalysisPredictor::SaveTrtCalibToDisk() { + PADDLE_ENFORCE(config_.tensorrt_engine_enabled(), + "This func can be invoked only in trt mode"); + auto &block = inference_program_->Block(0); + for (auto &op_desc : block.AllOps()) { + if (op_desc->Type() == "tensorrt_engine") { + std::string engine_name = + boost::get(op_desc->GetAttr("engine_key")); + if (!Singleton::Global().Has(engine_name)) { + LOG(ERROR) << "You should run the predictor(with trt) on the real data " + "to generate calibration info"; + return false; + } + TRTCalibratorEngine *calib_engine = + Singleton::Global().Get(engine_name); + LOG(INFO) << "Wait for calib threads done."; + calib_engine->calib_->waitAndSetDone(); + LOG(INFO) << "Generating TRT Calibration table data, this may cost a lot " + "of time..."; + calib_engine->thr_->join(); + std::string calibration_table_data = + calib_engine->calib_->getCalibrationTableAsString(); + + if (calibration_table_data.empty()) { + LOG(ERROR) << "the calibration table is empty."; + return false; + } + + std::string model_opt_cache_dir = + argument_.Has("model_dir") + ? argument_.model_dir() + : inference::analysis::GetDirRoot(argument_.model_program_path()); + + std::string calibration_table_data_path = + inference::analysis::GetTrtCalibPath( + inference::analysis::GetOrCreateModelOptCacheDir( + model_opt_cache_dir), + engine_name); + + std::ofstream ofile(calibration_table_data_path, std::ios::out); + LOG(INFO) << "Write Paddle-TRT INT8 calibration table data to file " + << calibration_table_data_path; + ofile << calibration_table_data; + ofile.close(); + } + } + // Free all calibrator resources. 
+ Singleton::Global().DeleteALL(); + return true; +} +#endif + AnalysisPredictor::~AnalysisPredictor() { +#if PADDLE_WITH_TENSORRT + if (config_.tensorrt_engine_enabled() && + config_.tensorrt_precision_mode_ == AnalysisConfig::Precision::kInt8 && + Singleton::Global().Has()) { + SaveTrtCalibToDisk(); + } +#endif if (FLAGS_profile) { platform::DisableProfiler(platform::EventSortingKey::kTotal, "./profile.log"); @@ -653,6 +726,10 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() { return need; } +std::string AnalysisPredictor::GetSeriazlizedProgram() const { + return inference_program_->Proto()->SerializeAsString(); +} + template <> std::unique_ptr CreatePaddlePredictor( const contrib::AnalysisConfig &config) { diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index a8ea67d4bd332b5614f4f6593e8397829d28c5a6..fa1d0d596df5a3619af74e0fead3a0b376186e08 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -75,6 +75,8 @@ class AnalysisPredictor : public PaddlePredictor { void SetMkldnnThreadID(int tid); + std::string GetSeriazlizedProgram() const override; + protected: // For memory optimization. bool need_collect_var_shapes_for_memory_optim(); @@ -97,6 +99,21 @@ class AnalysisPredictor : public PaddlePredictor { void GetFetchOne(const framework::LoDTensor &fetchs, PaddleTensor *output_data); +#if PADDLE_WITH_TENSORRT + // When we use Paddle-TRT INT8 engine, we need to generate calibration table + // data first, + // the calibration table contains the range for each op's input and output, + // this whole process can be divided into several steps: + // + // 1. Builds a 32-bit engine, runs it on the calibration set, and records a + // histogram for each + // tensor of the distribution of activation values. + // 2. Builds a calibration table from the histograms. + // + // After step 2, we need to store the calibration table on disk + bool SaveTrtCalibToDisk(); +#endif + // Some more detailed tests, they are made the friends of the predictor, so that // the all the details can be tested. #if PADDLE_WITH_TESTING diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 4688e93d7102109d2c7ece9ba37bc8f2d311dcf1..20b61344da978a87baf654efd4ad2b3ae90454c0 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -215,6 +215,8 @@ TEST(AnalysisPredictor, memory_optim) { { // The first predictor help to cache the memory optimize strategy. auto predictor = CreatePaddlePredictor(config); + LOG(INFO) << "serialized program: " << predictor->GetSeriazlizedProgram(); + ASSERT_FALSE(predictor->GetSeriazlizedProgram().empty()); // Run several times to check the parameters are not reused by mistake. for (int i = 0; i < 5; i++) { diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index 9be059c73e20ebeeff2c4b6e8e5502e4a56fd0d6..6cd18277d63200f5bccf180a7ae3196b0ce126ff 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
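For context, the destructor hook and SaveTrtCalibToDisk() above imply the following user-side flow for generating a Paddle-TRT INT8 calibration table. This is only a sketch: the model directory, feed name, input shape and iteration count are placeholders, and the GPU setup assumes the existing EnableUseGpu() config method rather than anything added by this patch.

```cpp
// Minimal sketch of the INT8 calibration flow implied by the predictor
// changes above; paths, names and shapes are illustrative placeholders.
#include <algorithm>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void RunTrtInt8Calibration() {
  paddle::contrib::AnalysisConfig config;
  config.SetModel("./mobilenet_model");  // hypothetical model directory
  config.EnableUseGpu(100 /* MB */, 0);  // assumed GPU switch of this config API
  // Request INT8. With no calibration table yet, the tensorrt_engine op runs
  // in calibration mode and records activation ranges.
  config.EnableTensorRtEngine(
      1 << 20 /* workspace */, 1 /* max batch */, 3 /* min subgraph */,
      paddle::contrib::AnalysisConfig::Precision::kInt8);

  auto predictor = paddle::CreatePaddlePredictor(config);

  paddle::PaddleTensor input;
  input.name = "data";             // hypothetical feed name
  input.shape = {1, 3, 224, 224};  // hypothetical input shape
  input.dtype = paddle::PaddleDType::FLOAT32;
  input.data.Resize(3 * 224 * 224 * sizeof(float));
  std::fill_n(static_cast<float *>(input.data.data()), 3 * 224 * 224, 0.5f);
  std::vector<paddle::PaddleTensor> inputs{input};

  // Feed representative batches so the calibrator can observe real ranges.
  std::vector<paddle::PaddleTensor> outputs;
  for (int i = 0; i < 100; ++i) {
    predictor->Run(inputs, &outputs);
  }
  // Destroying the predictor triggers SaveTrtCalibToDisk(), which writes the
  // calibration table next to the model files for later INT8 inference runs.
  predictor.reset();
}
```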
+#include +#include "paddle/fluid/framework/commit.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" @@ -97,4 +99,12 @@ void PaddleBuf::Free() { } } +std::string get_version() { + std::stringstream ss; + ss << "version: " << framework::paddle_version() << "\n"; + ss << "commit: " << framework::paddle_commit() << "\n"; + ss << "branch: " << framework::paddle_compile_branch() << "\n"; + return ss.str(); +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/api_tester.cc b/paddle/fluid/inference/api/api_tester.cc index 7a579610eefda24c911edd28b5f3a178aa10ab1e..2c450ef7cead4d5c3870d5e9186eb221e5dc19a0 100644 --- a/paddle/fluid/inference/api/api_tester.cc +++ b/paddle/fluid/inference/api/api_tester.cc @@ -61,4 +61,10 @@ TEST(paddle_inference_api, demo) { predictor->Run({}, &outputs); } +TEST(paddle_inference_api, get_version) { + LOG(INFO) << "paddle version:\n" << get_version(); + auto version = get_version(); + ASSERT_FALSE(version.empty()); +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index f89eaeaadcc50fd7979d6807e8f2c7556e048e6c..5b899b26d60dec3634d7016c925143e1ae26992d 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -42,6 +42,10 @@ struct AnalysisConfig { explicit AnalysisConfig(const std::string& model_dir); explicit AnalysisConfig(const std::string& prog_file, const std::string& params_file); + enum class Precision { + kFloat32 = 0, + kInt8, + }; /** Set model with a directory. */ @@ -135,7 +139,8 @@ struct AnalysisConfig { * subgraph is less than this, it will not transfer to TensorRT engine. */ void EnableTensorRtEngine(int workspace_size = 1 << 20, - int max_batch_size = 1, int min_subgraph_size = 3); + int max_batch_size = 1, int min_subgraph_size = 3, + Precision precision = Precision::kFloat32); /** A boolean state telling whether the TensorRT engine is used. */ bool tensorrt_engine_enabled() const { return use_tensorrt_; } @@ -229,6 +234,7 @@ struct AnalysisConfig { // We set this variable to control the minimum number of nodes in the // subgraph, 3 as default value. int tensorrt_min_subgraph_size_{3}; + Precision tensorrt_precision_mode_; // memory reuse related. bool enable_memory_optim_{false}; diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 46b510fd1ec94c59032b8f41a2ac4d6aa87dc150..406983224615fbdb649301f1ffe3fbd136938a61 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -215,6 +215,14 @@ class PaddlePredictor { */ virtual ~PaddlePredictor() = default; + /** \brief Get the serialized model program that executes in inference phase. + * Its data type is ProgramDesc, which is a protobuf message. + */ + virtual std::string GetSeriazlizedProgram() const { + assert(false); // Force raise error. + return "NotImplemented"; + }; + /** The common configs for all the predictors. 
*/ struct Config { @@ -288,4 +296,6 @@ std::unique_ptr CreatePaddlePredictor(const ConfigT& config); int PaddleDtypeSize(PaddleDType dtype); +std::string get_version(); + } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index 9afeafd176c70bc03166ec7732ae5e2faf67ea54..f4977d08c4d051b8a528e122c47948c3c81d153c 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -1,4 +1,4 @@ -nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context) +nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context) nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto) nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 78b590f15d639f7b21b403413760948c6343d998..10f48462cfaf8073a4f5537d654d614d36b74db4 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -69,6 +69,13 @@ void TensorRTEngine::FreezeNetwork() { // build engine. infer_builder_->setMaxBatchSize(max_batch_); infer_builder_->setMaxWorkspaceSize(max_workspace_); + if (enable_int8_) { + infer_builder_->setInt8Mode(true); + PADDLE_ENFORCE( + calibrator_ != nullptr, + "The precision mode is 'INT8', the calibrator should not be nullptr"); + infer_builder_->setInt8Calibrator(calibrator_); + } infer_engine_.reset(infer_builder_->buildCudaEngine(*infer_network_)); PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!"); diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 65ab7f3caaa746cf339de67706939070a0b7d87d..cdfe09b5a7fd2d1f8548dab9421f671f5a345153 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -23,12 +23,14 @@ limitations under the License. */ #include "paddle/fluid/inference/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/utils/singleton.h" namespace paddle { namespace inference { namespace tensorrt { +class TRTInt8Calibrator; /* * TensorRT Engine. * @@ -55,13 +57,16 @@ class TensorRTEngine : public EngineBase { }; TensorRTEngine(int max_batch, int max_workspace, cudaStream_t stream, - int device = 0, + int device = 0, bool enable_int8 = false, + TRTInt8Calibrator* calibrator = nullptr, nvinfer1::ILogger& logger = NaiveLogger::Global()) : max_batch_(max_batch), max_workspace_(max_workspace), stream_(stream), - logger_(logger), - device_(device) {} + device_(device), + enable_int8_(enable_int8), + calibrator_(calibrator), + logger_(logger) {} virtual ~TensorRTEngine(); @@ -139,8 +144,8 @@ class TensorRTEngine : public EngineBase { // In the normal case, the paddle-trt exists bug when runing the googlenet. // When there are more than two convolutions of 1 * 1 with the same input, the // paddle-tensorrt will do the merging optimization, which fuse those conv - // into - // one conv, and then trigger bug. So, We should use strategy to avoid this + // into one conv, and then trigger bug. 
So, We should use strategy to avoid + // this // optimization for the time being. This bug will be fixed in the future. std::unordered_map itensor_quote_num; @@ -153,9 +158,14 @@ class TensorRTEngine : public EngineBase { // the max memory size the engine uses int max_workspace_; + cudaStream_t stream_; + // The specific GPU id that the TensorRTEngine bounded to. + int device_; + + bool enable_int8_; + TRTInt8Calibrator* calibrator_; // batch size of the current data, will be updated each Executation. int batch_size_{-1}; - cudaStream_t stream_; nvinfer1::ILogger& logger_; @@ -165,8 +175,6 @@ class TensorRTEngine : public EngineBase { std::unordered_map itensor_map_; - // The specific GPU id that the TensorRTEngine bounded to. - int device_; std::vector> owned_plugin_; // TensorRT related internal members diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc new file mode 100644 index 0000000000000000000000000000000000000000..4a85c8b8fe6d70052edd3be59f98582c9b2e86b9 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc @@ -0,0 +1,147 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" +#include "glog/logging.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +// set the batch size before constructing the thread to execute engine +int TRTInt8Calibrator::getBatchSize() const { return batch_size_; } + +TRTInt8Calibrator::TRTInt8Calibrator( + const std::unordered_map& buffers, int batch_size, + std::string engine_name, const platform::Place place) + : batch_size_(batch_size), engine_name_(engine_name) { + int i = 0; + VLOG(4) << "Init a new calibrator: " << engine_name_; + for (const auto it : buffers) { + framework::Tensor temp_tensor; + std::string input_name = it.first; + int data_size = it.second; + int num_ele = data_size / sizeof(int16_t); + framework::DDim data_shape = framework::make_ddim({num_ele}); + temp_tensor.Resize(data_shape); + data_tensors_.push_back(temp_tensor); + data_buffers_[input_name] = std::pair( + static_cast(temp_tensor.mutable_data(place)), num_ele); + i += 1; + } +} + +TRTInt8Calibrator::TRTInt8Calibrator(const std::string& calib_data) + : batch_size_(0), + calib_running_(false), + data_is_set_(false), + done_(true), + calibration_table_(calib_data) {} + +void TRTInt8Calibrator::waitAndSetDone() { + std::unique_lock lk(mut_); + while ((calib_running_ || data_is_set_) && !done_) cond_.wait(lk); + if (!done_) { + done_ = true; + cond_.notify_all(); + } +} + +// There might be more than one input for trt subgraph, +// So, we use a map to store input information. +bool TRTInt8Calibrator::setBatch( + const std::unordered_map& data) { + VLOG(3) << "set batch: " << engine_name_; + std::unique_lock lk(mut_); + // There is a producer and a consumer. The producer set the batch data and + // the consumer get the batch data. 
The size of the data pool is one. + // So, the producer has to wait for the consumer to finish processing before + // they can set the data. + while ((calib_running_ || data_is_set_) && (!done_)) cond_.wait(lk); + // The done_ is set to true using waitAndSetDone, When all calibration data + // are processed. + if (done_) return false; + + // Sets the batch. + for (const auto& it : data) { + auto dataptr = data_buffers_.find(it.first); + if (dataptr == data_buffers_.end()) { + LOG(FATAL) << "FATAL " << engine_name_ << " input name '" << it.first + << "' does not match with the buffer names"; + } + const auto& d = dataptr->second; + PADDLE_ENFORCE( + cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice), + "Fail to cudaMemcpy %s for %s", engine_name_, it.first); + } + + data_is_set_ = true; + cond_.notify_all(); + return true; +} + +bool TRTInt8Calibrator::getBatch(void** bindings, const char** names, + int num_bindings) { + VLOG(4) << "get batch: " << engine_name_; + std::unique_lock lk(mut_); + // The consumer has just finished processing a data. + // The producer can set the data again. + calib_running_ = false; + cond_.notify_all(); + + // As long as there is data in the pool, the consumer can get it. + while (!data_is_set_ && !done_) cond_.wait(lk); + if (done_) return false; + + // Gets the batch + for (int i = 0; i < num_bindings; i++) { + auto it = data_buffers_.find(names[i]); + if (it == data_buffers_.end()) { + LOG(FATAL) << "Calibration engine asked for unknown tensor name '" + << names[i] << "' at position " << i; + } + bindings[i] = it->second.first; + } + + data_is_set_ = false; + calib_running_ = true; + VLOG(4) << "get batch done: " << engine_name_; + return true; +} + +void TRTInt8Calibrator::setDone() { + std::unique_lock lk(mut_); + done_ = true; + cond_.notify_all(); +} + +const void* TRTInt8Calibrator::readCalibrationCache(size_t& length) { + if (calibration_table_.empty()) return nullptr; + length = calibration_table_.size(); + return calibration_table_.data(); +} + +void TRTInt8Calibrator::writeCalibrationCache(const void* ptr, + std::size_t length) { + calibration_table_ = std::string((const char*)ptr, length); + VLOG(4) << "Got calibration data for " << engine_name_ << " " << ptr + << " length=" << length; +} +TRTInt8Calibrator::~TRTInt8Calibrator() { + VLOG(4) << "Destroying calibrator for " << engine_name_; +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h new file mode 100644 index 0000000000000000000000000000000000000000..919f5d55f88c3a6473f66371e2f3d91f3c4721c5 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h @@ -0,0 +1,128 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
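The setBatch()/getBatch() pair above is a one-slot producer/consumer handshake between the op thread that feeds calibration batches and the thread building the TensorRT engine. The standalone sketch below uses only the standard library and illustrative names to show the same protocol; it is not Paddle code.

```cpp
// Standalone illustration of the single-slot handshake: the producer may only
// publish a batch once the previous one has been consumed, and a `done` flag
// lets the shutdown path unblock both sides, as waitAndSetDone() does above.
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

struct OneSlotChannel {
  std::mutex mut;
  std::condition_variable cond;
  bool data_is_set = false;    // a batch is waiting to be consumed
  bool calib_running = false;  // consumer is processing the current batch
  bool done = false;
  int slot = 0;

  bool SetBatch(int value) {  // producer, like setBatch()
    std::unique_lock<std::mutex> lk(mut);
    cond.wait(lk, [&] { return (!calib_running && !data_is_set) || done; });
    if (done) return false;
    slot = value;
    data_is_set = true;
    cond.notify_all();
    return true;
  }

  bool GetBatch(int* value) {  // consumer, like getBatch()
    std::unique_lock<std::mutex> lk(mut);
    calib_running = false;  // previous batch finished
    cond.notify_all();
    cond.wait(lk, [&] { return data_is_set || done; });
    if (done) return false;
    *value = slot;
    data_is_set = false;
    calib_running = true;  // start processing this batch
    return true;
  }

  void WaitAndSetDone() {  // like waitAndSetDone()
    std::unique_lock<std::mutex> lk(mut);
    cond.wait(lk, [&] { return (!calib_running && !data_is_set) || done; });
    done = true;
    cond.notify_all();
  }
};

int main() {
  OneSlotChannel ch;
  std::thread consumer([&] {
    int v;
    while (ch.GetBatch(&v)) std::cout << "calibrated on batch " << v << "\n";
  });
  for (int i = 0; i < 3; ++i) ch.SetBatch(i);
  ch.WaitAndSetDone();
  consumer.join();
  return 0;
}
```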
+ +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class TensorRTEngine; + +struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator { + public: + TRTInt8Calibrator(const std::unordered_map& buffers, + int batch_size, std::string engine_name, + const platform::Place place); + + explicit TRTInt8Calibrator(const std::string& calibration_data); + ~TRTInt8Calibrator(); + + int getBatchSize() const override; + + bool getBatch(void* bindings[], const char* names[], + int num_bindings) override; + + bool setBatch(const std::unordered_map& data); + void setDone(); + void waitAndSetDone(); + + const void* readCalibrationCache(std::size_t& length) override; + void writeCalibrationCache(const void* ptr, std::size_t length) override; + const std::string& getCalibrationTableAsString() { + return calibration_table_; + } + + private: + const int batch_size_; + + bool calib_running_{true}; + bool data_is_set_{false}; + bool done_{false}; + + std::mutex mut_; + std::condition_variable cond_; + + std::unordered_map> data_buffers_; + std::vector data_tensors_; + + std::string engine_name_; + std::string calibration_table_; +}; + +class TRTCalibratorEngine { + public: + TRTCalibratorEngine() {} + std::unique_ptr calib_; + std::unique_ptr thr_; + std::unique_ptr engine_; +}; +/* + * Manager to control the TensorRT Int8 calibration creation and deltetion. + */ +class TRTCalibratorEngineManager { + public: + bool Has() const { return res_.size() > 0; } + bool Has(const std::string& name) const { + if (res_.count(name) == 0) return false; + return res_.at(name).get() != nullptr; + } + + // Get Int8Calibrator via name + TRTCalibratorEngine* Get(const std::string& name) const { + return res_.at(name).get(); + } + + // Look up or create a calibrator. 
+ TRTCalibratorEngine* LookupOrCreate(const std::string& engine_name) { + if (res_.count(engine_name) == 0) { + auto* p = new TRTCalibratorEngine; + res_[engine_name].reset(p); + } + return res_.at(engine_name).get(); + } + + // Create an Int8Calibrator + TRTCalibratorEngine* Create(const std::string& engine_name) { + auto* p = new TRTCalibratorEngine; + res_[engine_name].reset(p); + return p; + } + + void DeleteALL() { + for (auto& item : res_) { + item.second.reset(nullptr); + } + } + + private: + std::unordered_map> res_; +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 423c39813f05af0d6aaade184914e6777c9b8a83..b0f7dcc0dffcb71a8b88c764dbced05fde36745e 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -54,6 +54,7 @@ else() message(WARNING "These tests has been disabled in OSX or WITH_MKL=OFF before being fixed: \n test_analyzer_seq_pool1") endif() + # RNN2 set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2") download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz") @@ -115,6 +116,10 @@ if (NOT EXISTS ${MOBILENET_INSTALL_DIR}) endif() inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc SERIAL) +# googlenet +inference_analysis_api_test_with_fake_data(test_analyzer_googlenet + "${INFERENCE_DEMO_INSTALL_DIR}/googlenet" analyzer_resnet50_tester.cc "googlenet.tar.gz" SERIAL) + # resnet50 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz" SERIAL) @@ -123,6 +128,11 @@ inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz" SERIAL) +# bert +set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert") +download_model_and_data(${BERT_INSTALL_DIR} "bert_model.tar.gz" "bert_data.txt.tar.gz") +inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert_tester.cc) + # anakin if (WITH_ANAKIN AND WITH_MKL) # only needed in CI # anakin rnn1 diff --git a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..24cbd39ea0fbf425a8fed66f6413008f1a97408d --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc @@ -0,0 +1,229 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
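To tie the pieces of this header together, here is a condensed sketch of how the manager is meant to be used: the tensorrt_engine op creates one TRTCalibratorEngine per engine_key while calibrating, and the predictor teardown path looks the same entry up to flush the table. It mirrors code elsewhere in this patch, omits the feeding loop and error handling, and is not a complete implementation.

```cpp
// Condensed sketch of the manager's intended use (mirrors the code in
// analysis_predictor.cc and tensorrt_engine_op.h in this patch).
#include <string>
#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#include "paddle/fluid/inference/utils/singleton.h"

using paddle::inference::Singleton;
using paddle::inference::tensorrt::TRTCalibratorEngine;
using paddle::inference::tensorrt::TRTCalibratorEngineManager;

// Op side: one calibrator engine per engine_key, created on first run.
// LookupOrCreate() wraps the same Has()/Create() check in one call.
TRTCalibratorEngine *GetOrCreateCalibEngine(const std::string &engine_key) {
  auto &manager = Singleton<TRTCalibratorEngineManager>::Global();
  if (!manager.Has(engine_key)) {
    // The op then fills res->calib_, res->thr_ and res->engine_.
    return manager.Create(engine_key);
  }
  return manager.Get(engine_key);
}

// Predictor side: flush the calibration table once all batches are consumed.
std::string FinishCalibration(const std::string &engine_key) {
  auto &manager = Singleton<TRTCalibratorEngineManager>::Global();
  TRTCalibratorEngine *res = manager.Get(engine_key);
  res->calib_->waitAndSetDone();  // unblock getBatch() and stop the loop
  res->thr_->join();              // wait for the TRT build thread to exit
  std::string table = res->calib_->getCalibrationTableAsString();
  manager.DeleteALL();            // release calibrators, threads and engines
  return table;
}
```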
+ +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +using paddle::PaddleTensor; +using paddle::contrib::AnalysisConfig; + +template +void GetValueFromStream(std::stringstream *ss, T *t) { + (*ss) >> (*t); +} + +template <> +void GetValueFromStream(std::stringstream *ss, std::string *t) { + *t = ss->str(); +} + +// Split string to vector +template +void Split(const std::string &line, char sep, std::vector *v) { + std::stringstream ss; + T t; + for (auto c : line) { + if (c != sep) { + ss << c; + } else { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } + } + + if (!ss.str().empty()) { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } +} + +template +constexpr paddle::PaddleDType GetPaddleDType(); + +template <> +constexpr paddle::PaddleDType GetPaddleDType() { + return paddle::PaddleDType::INT64; +} + +template <> +constexpr paddle::PaddleDType GetPaddleDType() { + return paddle::PaddleDType::FLOAT32; +} + +// Parse tensor from string +template +bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) { + std::vector data; + Split(field, ':', &data); + if (data.size() < 2) return false; + + std::string shape_str = data[0]; + + std::vector shape; + Split(shape_str, ' ', &shape); + + std::string mat_str = data[1]; + + std::vector mat; + Split(mat_str, ' ', &mat); + + tensor->shape = shape; + auto size = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()) * + sizeof(T); + tensor->data.Resize(size); + std::copy(mat.begin(), mat.end(), static_cast(tensor->data.data())); + tensor->dtype = GetPaddleDType(); + + return true; +} + +// Parse input tensors from string +bool ParseLine(const std::string &line, + std::vector *tensors) { + std::vector fields; + Split(line, ';', &fields); + + if (fields.size() < 5) return false; + + tensors->clear(); + tensors->reserve(5); + + int i = 0; + // src_id + paddle::PaddleTensor src_id; + ParseTensor(fields[i++], &src_id); + tensors->push_back(src_id); + + // pos_id + paddle::PaddleTensor pos_id; + ParseTensor(fields[i++], &pos_id); + tensors->push_back(pos_id); + + // segment_id + paddle::PaddleTensor segment_id; + ParseTensor(fields[i++], &segment_id); + tensors->push_back(segment_id); + + // self_attention_bias + paddle::PaddleTensor self_attention_bias; + ParseTensor(fields[i++], &self_attention_bias); + tensors->push_back(self_attention_bias); + + // next_segment_index + paddle::PaddleTensor next_segment_index; + ParseTensor(fields[i++], &next_segment_index); + tensors->push_back(next_segment_index); + + return true; +} + +bool LoadInputData(std::vector> *inputs) { + if (FLAGS_infer_data.empty()) { + LOG(ERROR) << "please set input data path"; + return false; + } + + std::ifstream fin(FLAGS_infer_data); + std::string line; + int sample = 0; + + // The unit-test dataset only have 10 samples, each sample have 5 feeds. 
+ while (std::getline(fin, line)) { + std::vector feed_data; + ParseLine(line, &feed_data); + inputs->push_back(std::move(feed_data)); + sample++; + if (!FLAGS_test_all_data && sample == FLAGS_batch_size) break; + } + LOG(INFO) << "number of samples: " << sample; + + return true; +} + +void SetConfig(contrib::AnalysisConfig *config) { + config->SetModel(FLAGS_infer_model); +} + +void profile(bool use_mkldnn = false) { + contrib::AnalysisConfig config; + SetConfig(&config); + + if (use_mkldnn) { + config.EnableMKLDNN(); + } + + std::vector outputs; + std::vector> inputs; + LoadInputData(&inputs); + TestPrediction(reinterpret_cast(&config), + inputs, &outputs, FLAGS_num_threads); +} + +TEST(Analyzer_bert, profile) { profile(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_bert, profile_mkldnn) { profile(true); } +#endif + +// Check the fuse status +TEST(Analyzer_bert, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); + LOG(INFO) << "num_ops: " << num_ops; +} + +// Compare result of NativeConfig and AnalysisConfig +void compare(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg); + if (use_mkldnn) { + cfg.EnableMKLDNN(); + } + + std::vector> inputs; + LoadInputData(&inputs); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), inputs); +} + +TEST(Analyzer_bert, compare) { compare(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_bert, compare_mkldnn) { compare(true /* use_mkldnn */); } +#endif + +// Compare Deterministic result +// TODO(luotao): Since each unit-test on CI only have 10 minutes, cancel this to +// decrease the CI time. +// TEST(Analyzer_bert, compare_determine) { +// AnalysisConfig cfg; +// SetConfig(&cfg); +// +// std::vector> inputs; +// LoadInputData(&inputs); +// CompareDeterministic(reinterpret_cast(&cfg), +// inputs); +// } +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index 5d8684f083bda8499000c9fd0a7617cf129db13b..8759ec8096cf102ab85d2c2a91eddc23a6ed0e50 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -13,9 +13,15 @@ // limitations under the License. 
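Before the allocator change below, one note on the BERT tester above: LoadInputData() reads one sample per line from FLAGS_infer_data, and ParseLine() expects five ';'-separated tensors, each written as a space-separated shape, a ':', and space-separated values. The constant below is a hand-made two-token sample in that format; the real shapes, values and dtypes come from the downloaded bert_data.txt, so treat everything here as illustrative.

```cpp
// Illustrative only: a hand-written line in the format ParseLine() expects.
// Field order follows the tester: src_id, pos_id, segment_id,
// self_attention_bias, next_segment_index.
const char *kSampleLine =
    "1 2:101 2054;"     // src_id              shape [1, 2], integer token ids
    "1 2:0 1;"          // pos_id              shape [1, 2]
    "1 2:0 0;"          // segment_id          shape [1, 2]
    "1 2 2 1:0 0 0 0;"  // self_attention_bias shape [1, 2, 2, 1], float
    "1 1:0";            // next_segment_index  shape [1, 1]
```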
#include "paddle/fluid/memory/allocation/legacy_allocator.h" + #include #include #include + +#ifdef PADDLE_WITH_JEMALLOC +#include +#endif + #include "glog/logging.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h" @@ -95,7 +101,11 @@ struct NaiveAllocator { template <> void *Alloc(const platform::CPUPlace &place, size_t size) { VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); +#ifdef PADDLE_WITH_JEMALLOC + void *p = malloc(size); +#else void *p = GetCPUBuddyAllocator()->Alloc(size); +#endif if (FLAGS_init_allocated_mem) { memset(p, 0xEF, size); } @@ -107,12 +117,21 @@ template <> void Free(const platform::CPUPlace &place, void *p, size_t size) { VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); +#ifdef PADDLE_WITH_JEMALLOC + free(p); +#else GetCPUBuddyAllocator()->Free(p); +#endif } template <> size_t Used(const platform::CPUPlace &place) { +#ifdef PADDLE_WITH_JEMALLOC + // fake the result of used memory when PADDLE_WITH_JEMALLOC is ON + return 0U; +#else return GetCPUBuddyAllocator()->Used(); +#endif } #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/distributed/proto_encoder_helper.h b/paddle/fluid/operators/distributed/proto_encoder_helper.h index 27ca1f4edc04f5fca54b1a6340243634a596939c..e9f06f54327875c0568c571627e9effb998e15be 100644 --- a/paddle/fluid/operators/distributed/proto_encoder_helper.h +++ b/paddle/fluid/operators/distributed/proto_encoder_helper.h @@ -85,7 +85,7 @@ class ProtoEncodeHelper { #define REPLACE_ENFORCE_GLOG 1 // Make sure callers didn't do operations that went over max_size promised if (paddle::platform::is_error(p_ <= limit_)) { - paddle::platform::throw_on_error(p_ <= limit_); + paddle::platform::throw_on_error(p_ <= limit_, ""); } #undef REPLACE_ENFORCE_GLOG } diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index 913ae76b38dc663d6fb4102f795ac713fd8a6bdf..a1c5c0777402b808eed6306862fd6dd41b529dbd 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -54,6 +54,11 @@ bool RequestSendHandler::Handle(const std::string& varname, // Async if (!sync_mode_) { VLOG(3) << "async process var: " << varname; + if (varname == BATCH_BARRIER_MESSAGE) { + PADDLE_THROW( + "async mode should not recv BATCH_BARRIER_MESSAGE or " + "COMPLETE_MESSAGE"); + } try { executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), scope); diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc index cc5b9c29a12ec5386041dfeea22fd388d94115e6..c3a46e348c69a20953f013c7de772a37db5f4844 100644 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -39,27 +39,33 @@ void RPCServer::SavePort() const { port_file.open(file_path); port_file << selected_port_; port_file.close(); - VLOG(4) << "selected port written to " << file_path; + VLOG(3) << "selected port written to " << file_path; } void RPCServer::WaitBarrier(const std::string& rpc_name) { + VLOG(3) << "WaitBarrier in: " << rpc_name; std::unique_lock lock(this->mutex_); barrier_cond_.wait(lock, [this, &rpc_name] { return ((barrier_counter_[rpc_name] == client_num_ && client_num_ != 0) || exit_flag_.load()); }); - VLOG(3) << "batch_barrier_: " << rpc_name << " " - << barrier_counter_[rpc_name]; + VLOG(3) << "WaitBarrier out: " << 
rpc_name + << " counter: " << barrier_counter_[rpc_name]; } void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { - VLOG(4) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; + VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; + // barrier msg should make sure that it's in the right cond(send|recv) + WaitCond(rpc_name); int b = 0; std::unique_lock lock(mutex_); b = ++barrier_counter_[rpc_name]; + VLOG(3) << rpc_name << " barrier_counter: " << b; if (b >= client_num_) { lock.unlock(); + VLOG(3) << "BatchBarrier counter reach " << client_num_ << " for " + << rpc_name; barrier_cond_.notify_all(); lock.lock(); } @@ -71,7 +77,7 @@ void RPCServer::Complete() { client_num_--; need_reset_all_vars_ = true; - VLOG(4) << "decrease client_num to: " << client_num_; + VLOG(3) << "decrease client_num to: " << client_num_; if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) { barrier_counter_[kRequestGet]--; } @@ -105,8 +111,8 @@ void RPCServer::RegisterRPC(const std::string& rpc_name, static int cond = -1; rpc_cond_map_[rpc_name] = ++cond; - VLOG(4) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler - << ", cond:" << rpc_cond_map_[rpc_name]; + VLOG(3) << "RegisterRPC rpc_name: " << rpc_name << ", handler: " << handler + << ", cond: " << rpc_cond_map_[rpc_name]; } void RPCServer::SetCond(const std::string& rpc_name) { @@ -120,7 +126,7 @@ void RPCServer::SetCond(const std::string& rpc_name) { } void RPCServer::WaitCond(const std::string& rpc_name) { - VLOG(4) << "RPCServer WaitCond " << rpc_name; + VLOG(3) << "RPCServer WaitCond in " << rpc_name; int cond = 0; { std::unique_lock lock(mutex_); @@ -130,6 +136,7 @@ void RPCServer::WaitCond(const std::string& rpc_name) { std::unique_lock lock(mutex_); rpc_cond_.wait( lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); }); + VLOG(3) << "RPCServer WaitCond out " << rpc_name; } void RPCServer::RegisterVar(const std::string& var_name, @@ -151,7 +158,7 @@ void RPCServer::RegisterVar(const std::string& var_name, } rpc_cond_.notify_all(); - VLOG(4) << "RegisterVar context:" << h.String(); + VLOG(3) << "RegisterVar context:" << h.String(); } void RPCServer::IncreaseVarBarrier(const std::string& var_name) { @@ -167,11 +174,11 @@ void RPCServer::IncreaseVarBarrier(const std::string& var_name) { barrier_cond_.notify_all(); } - VLOG(4) << "IncreaseVarBarrier context:" << h.String(); + VLOG(3) << "IncreaseVarBarrier context:" << h.String(); } void RPCServer::WaitVarBarrier(const std::string& var_name) { - VLOG(4) << "WaitBarrier var_name:" << var_name; + VLOG(3) << "WaitVarBarrier var_name:" << var_name; std::unique_lock lock(mutex_); barrier_cond_.wait(lock, [&]() { @@ -179,11 +186,11 @@ void RPCServer::WaitVarBarrier(const std::string& var_name) { exit_flag_.load()); }); - VLOG(4) << "WaitBarrier context: " << var_map_[var_name].String(); + VLOG(3) << "WaitVarBarrier context: " << var_map_[var_name].String(); } void RPCServer::SetVarCond(const std::string& var_name) { - VLOG(4) << "SetVarCond var_name:" << var_name; + VLOG(3) << "SetVarCond var_name:" << var_name; { std::unique_lock lock(mutex_); if (var_map_.find(var_name) != var_map_.end()) { @@ -193,14 +200,14 @@ void RPCServer::SetVarCond(const std::string& var_name) { } void RPCServer::WaitVarCond(const std::string& var_name) { - VLOG(4) << "WaitVarCond var_name:" << var_name; + VLOG(3) << "WaitVarCond var_name:" << var_name; std::unique_lock lock(mutex_); rpc_cond_.wait(lock, [=] { return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load()); 
}); - VLOG(4) << "WaitVarCond var_name:" << var_name << " end"; + VLOG(3) << "WaitVarCond var_name:" << var_name << " end"; } MonomerHandle RPCServer::GetMonomer(const std::string& var_name) { diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc index 53968831ea0d640d13fc69ce1855257e8deed54c..5b30ed472d51a37a0705d1717395da9e4ff7d743 100644 --- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc +++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc @@ -137,7 +137,9 @@ void ListenAndServOp::RunSyncLoop( while (true) { // Get from multiple trainers, we don't care about the order in which // the gradients arrives, just add suffix 0~n and merge the gradient. + VLOG(3) << "wait all clients to send gradient"; rpc_service_->SetCond(distributed::kRequestSend); + VLOG(3) << "wait all clients to send send_barrier"; rpc_service_->WaitBarrier(distributed::kRequestSend); if (rpc_service_->IsExit()) { @@ -168,12 +170,16 @@ void ListenAndServOp::RunSyncLoop( } ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program, recv_scope); - VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; + VLOG(3) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; + VLOG(3) << "ResetReceivedVars"; ResetReceivedVars(recv_scope, dev_ctx, rpc_service_->NeedResetAllVars()); + VLOG(3) << "wait all clients to get parameters back"; rpc_service_->SetCond(distributed::kRequestGet); + VLOG(3) << "wait all clients to send fetch_barrier"; rpc_service_->WaitBarrier(distributed::kRequestGet); + VLOG(3) << "ResetBarrierCounter"; rpc_service_->ResetBarrierCounter(); } // while(true) } diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index 14a2524bd8f4a9f7685c84f1d9767f5f7eedf0e7..241184c6f4a19a1da0d6d75c5d4e2b372c14e9da 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -43,12 +43,14 @@ class GridSampleOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(grid_dims[3] == 2, "Input(Grid) dims[3] should be 2."); PADDLE_ENFORCE_EQ(grid_dims[0], x_dims[0], "Input(X) and Input(Grid) dims[0] should be equal."); - PADDLE_ENFORCE_EQ( - grid_dims[1], x_dims[2], - "Input(X) dims[2] and Input(Grid) dims[1] should be equal."); - PADDLE_ENFORCE_EQ( - grid_dims[2], x_dims[3], - "Input(X) dims[3] and Input(Grid) dims[2] should be equal."); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ( + grid_dims[1], x_dims[2], + "Input(X) dims[2] and Input(Grid) dims[1] should be equal."); + PADDLE_ENFORCE_EQ( + grid_dims[2], x_dims[3], + "Input(X) dims[3] and Input(Grid) dims[2] should be equal."); + } ctx->SetOutputDim("Output", x_dims); ctx->ShareLoD("X", "Output"); diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc index 4e4f977fcc742856b877ef0b7f9a3cc9879aefce..097ba01d401dbc7969e30f576cac2567c874ed99 100644 --- a/paddle/fluid/operators/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/lrn_mkldnn_op.cc @@ -67,7 +67,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { mid->mutable_data(ctx.GetPlace()); const int n = ctx.Attr("n"); - const float alpha = ctx.Attr("alpha"); + // MKL-DNN implements LRN in a caffe way: + // http://caffe.berkeleyvision.org/tutorial/layers/lrn.html + // Where sum of squares is divided by size of normalization window + // this is not the case for PaddlePaddle LRN. 
+ // Hence we need to compensate for this difference by + // multiplying alpha by size of window(n) + const float alpha = ctx.Attr("alpha") * static_cast(n); const float beta = ctx.Attr("beta"); const float k = ctx.Attr("k"); const bool is_test = ctx.Attr("is_test"); @@ -78,10 +84,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto dims = paddle::framework::vectorize2int(x->dims()); auto src_md = paddle::platform::MKLDNNMemDesc( - dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - - auto dst_md = paddle::platform::MKLDNNMemDesc( - dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + dims, mkldnn::memory::data_type::f32, x->format()); auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward, mkldnn::lrn_across_channels, @@ -92,8 +95,6 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { k}; auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine}; - auto dst_memory = mkldnn::memory{{dst_md, mkldnn_engine}, - static_cast(output_data)}; if (!is_test) { const std::string key = ctx.op().Output("Out"); @@ -110,11 +111,16 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { src_memory->set_data_handle( static_cast(const_cast(input_data))); + auto dst_memory = mkldnn::memory(forward_pd->dst_primitive_desc(), + static_cast(output_data)); auto workspace_memory = insert_to_context( key_workspace_memory, dev_ctx, forward_pd->workspace_primitive_desc()); run_primitive(*forward_pd, *src_memory, *workspace_memory, dst_memory); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(dst_memory)); } else { auto forward_pd = mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine}; @@ -122,8 +128,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { src_memory_pd, static_cast(const_cast(input_data))}; auto workspace_memory = mkldnn::memory{forward_pd.workspace_primitive_desc()}; + auto dst_memory = mkldnn::memory(forward_pd.dst_primitive_desc(), + static_cast(output_data)); run_primitive(forward_pd, src_memory, workspace_memory, dst_memory); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(dst_memory)); } } }; @@ -151,7 +162,7 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { const std::string key_workspace_memory = key + "@lrn_workspace_memory"; const int n = ctx.Attr("n"); - const float alpha = ctx.Attr("alpha"); + const float alpha = ctx.Attr("alpha") * static_cast(n); const float beta = ctx.Attr("beta"); const float k = ctx.Attr("k"); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc index b993c55fad13e892efd51648b78704bec83bf2b4..031335009b692f9d1f73070c88e8e79d852cbe36 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc @@ -29,8 +29,14 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Xs", "A list of inputs.").AsDuplicable(); AddOutput("Ys", "A list of outputs").AsDuplicable(); AddAttr("subgraph", "the subgraph."); + AddAttr("calibration_data", "the calibration data for int8"); + AddAttr( + "engine_key", + "The engine_key here is used to distinguish different TRT Engines"); AddAttr("max_batch_size", "the maximum batch size."); AddAttr("workspace_size", "the workspace size."); + AddAttr("sub_block", "the trt block"); + AddAttr("enable_int8", "whether to switch to int8 mode");
AddComment("TensorRT engine operator."); } }; @@ -47,6 +53,6 @@ class TensorRTEngineInferVarType : public framework::VarTypeInference { namespace ops = paddle::operators; REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp, - ops::TensorRTEngineOpMaker); + ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker); #endif // PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index e7e990f759ba411f6954c51fb697a6befbad31b1..2ff35c7c6ac6409d529de5b794bfc322b1f5dd9b 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -17,8 +17,10 @@ #ifdef PADDLE_WITH_CUDA #include +#include #include +#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/inference/analysis/helper.h" @@ -62,6 +64,9 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector &shape) { using inference::Singleton; using inference::tensorrt::TensorRTEngine; +using inference::tensorrt::TRTInt8Calibrator; +using inference::tensorrt::TRTCalibratorEngine; +using inference::tensorrt::TRTCalibratorEngineManager; class TensorRTEngineOp : public framework::OperatorBase { private: @@ -70,6 +75,11 @@ class TensorRTEngineOp : public framework::OperatorBase { mutable std::unique_ptr trt_engine_; int max_batch_size_; int workspace_size_; + std::unique_ptr calibrator_; + bool enable_int8_; + std::string calibration_data_; + std::string engine_key_; + bool calibration_mode_; public: TensorRTEngineOp(const std::string &type, @@ -80,19 +90,96 @@ class TensorRTEngineOp : public framework::OperatorBase { input_names_ = Inputs("Xs"); max_batch_size_ = Attr("max_batch_size"); workspace_size_ = Attr("workspace_size"); + enable_int8_ = Attr("enable_int8"); + calibration_data_ = Attr("calibration_data"); + engine_key_ = Attr("engine_key"); auto params = Attr>("parameters"); for (const auto ¶m : params) { param_names_.insert(param); } + // calibration_mode is ture represents we need to + // generate the calibration table data. + calibration_mode_ = (enable_int8_ && calibration_data_.size() == 0); + + VLOG(4) << "calibration_mode: " << calibration_mode_; + if (enable_int8_ && calibration_data_.size()) { + calibrator_.reset(new TRTInt8Calibrator(calibration_data_)); + } } protected: + void RunNativeImpl(const framework::Scope &scope, + const platform::Place &dev_place) const { + framework::Executor executor(dev_place); + auto *block = Attr("sub_block"); + auto *program = block->Program(); + auto ¤t_scope = scope.NewScope(); + auto ctx = executor.Prepare(*program, block->ID()); + executor.RunPreparedContext(ctx.get(), ¤t_scope, false, true, true); + } + void RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const override { + if (calibration_mode_ == true) { + RunCalibration(scope, dev_place); + return; + } RunTrt(scope, dev_place); } + void RunCalibration(const framework::Scope &scope, + const platform::Place &dev_place) const { + // This process will builds a 32-bit trt engine, runs it on the calibration + // set, and records a histogram for each + // tensor of the distribution of activation values. + LOG_FIRST_N(INFO, 1) << "The TRT engine: " << engine_key_ + << " is running calibration trt int8... 
"; + int runtime_batch = 1; + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + auto stream = + reinterpret_cast(dev_ctx).stream(); + if (!Singleton::Global().Has(engine_key_)) { + TRTCalibratorEngine *calib_res = + Singleton::Global().Create(engine_key_); + std::unordered_map calib_buffers; + for (auto &x : input_names_) { + if (param_names_.count(x)) continue; + auto &t = + inference::analysis::GetFromScope(scope, x); + calib_buffers[x] = t.memory_size(); + auto t_shape = framework::vectorize(t.dims()); + runtime_batch = t_shape[0]; + } + calib_res->calib_.reset(new TRTInt8Calibrator( + calib_buffers, runtime_batch, engine_key_, dev_place)); + calib_res->thr_.reset(new std::thread([&]() { + calib_res->engine_.reset(new TensorRTEngine( + max_batch_size_, workspace_size_, stream, + boost::get(dev_place).device, enable_int8_, + calib_res->calib_.get())); + VLOG(3) << "start the calib trt engine thread"; + Prepare(scope, dev_place, calib_res->engine_.get()); + })); + } + + TRTInt8Calibrator *temp_calibrator = + Singleton::Global() + .Get(engine_key_) + ->calib_.get(); + std::unordered_map calib_data; + + for (auto &x : Inputs("Xs")) { + if (param_names_.count(x)) continue; + auto &t = + inference::analysis::GetFromScope(scope, x); + calib_data.emplace(x, t.data()); + } + temp_calibrator->setBatch(calib_data); + RunNativeImpl(scope, dev_place); + } + void RunTrt(const framework::Scope &scope, const platform::Place &dev_place) const { int runtime_batch = 1; @@ -101,9 +188,10 @@ class TensorRTEngineOp : public framework::OperatorBase { auto stream = reinterpret_cast(dev_ctx).stream(); if (trt_engine_.get() == nullptr) { - trt_engine_.reset(new TensorRTEngine( - max_batch_size_, workspace_size_, stream, - boost::get(dev_place).device)); + trt_engine_.reset( + new TensorRTEngine(max_batch_size_, workspace_size_, stream, + boost::get(dev_place).device, + enable_int8_, calibrator_.get())); Prepare(scope, dev_place, trt_engine_.get()); } @@ -173,7 +261,8 @@ class TensorRTEngineOp : public framework::OperatorBase { void Prepare(const framework::Scope &scope, const platform::Place &dev_place, TensorRTEngine *engine) const { - VLOG(4) << "Prepare engine"; + LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " + "kernel etc). 
This process may cost a lot of time."; framework::proto::BlockDesc block_desc; block_desc.ParseFromString(Attr("subgraph")); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 391e7a1c070e040f6e90f820634c0d8b7cd40a96..5a3d9d2c1a3e8111acbad2ddcf4f5469a3a99751 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -96,19 +96,20 @@ TEST(TensorRTEngineOp, manual) { engine_op_desc.SetType("tensorrt_engine"); engine_op_desc.SetInput("Xs", std::vector({"x"})); engine_op_desc.SetOutput("Ys", std::vector({"z0"})); - SetAttr(engine_op_desc.Proto(), "subgraph", - block_->SerializeAsString()); - SetAttr(engine_op_desc.Proto(), "max_batch_size", 2); - SetAttr(engine_op_desc.Proto(), "workspace_size", 1 << 20); - SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "a_engine"); - SetAttr>(engine_op_desc.Proto(), "parameters", - std::vector({})); - SetAttr>(engine_op_desc.Proto(), - "output_name_mapping", - std::vector({"z0"})); + + engine_op_desc.SetBlockAttr("sub_block", &block_desc); + engine_op_desc.SetAttr("max_batch_size", static_cast(2)); + engine_op_desc.SetAttr("workspace_size", static_cast(1 << 20)); + engine_op_desc.SetAttr("parameters", std::vector({})); + engine_op_desc.SetAttr("engine_key", std::string("a_engine")); + engine_op_desc.SetAttr("calibration_data", std::string("")); + engine_op_desc.SetAttr("enable_int8", static_cast(false)); + engine_op_desc.SetAttr("output_name_mapping", + std::vector({"z0"})); + engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); LOG(INFO) << "create engine op"; - auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); + auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); LOG(INFO) << "engine_op " << engine_op.get(); framework::Scope scope; @@ -190,20 +191,19 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { engine_op_desc.SetInput("Xs", std::vector({"x0"})); engine_op_desc.SetOutput("Ys", std::vector({"z3"})); - SetAttr(engine_op_desc.Proto(), "subgraph", - block_->SerializeAsString()); - SetAttr(engine_op_desc.Proto(), "max_batch_size", batch_size); - SetAttr(engine_op_desc.Proto(), "workspace_size", 1 << 20); - SetAttr>( - engine_op_desc.Proto(), "parameters", - std::vector({"y0", "y1", "y2", "y3"})); - SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "b_engine"); - - SetAttr>(engine_op_desc.Proto(), - "output_name_mapping", - std::vector({"z3"})); - - auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); + engine_op_desc.SetBlockAttr("sub_block", &block_desc); + engine_op_desc.SetAttr("max_batch_size", static_cast(batch_size)); + engine_op_desc.SetAttr("workspace_size", static_cast(1 << 20)); + engine_op_desc.SetAttr("parameters", + std::vector({"y0", "y1", "y2", "y3"})); + engine_op_desc.SetAttr("engine_key", std::string("b_engine")); + engine_op_desc.SetAttr("calibration_data", std::string("")); + engine_op_desc.SetAttr("enable_int8", static_cast(false)); + engine_op_desc.SetAttr("output_name_mapping", + std::vector({"z3"})); + engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); + + auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); // Execute them. 
engine_op->Run(scope, place); diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 15413785bab3c0fd77244141e8f1840ca0cc1356..142d38f0609d963ce3ff45c595b8432b0e5edd21 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -71,9 +71,8 @@ struct EnforceNotMet : public std::exception { } } - template - EnforceNotMet(const char* f, int l, ARGS... args) { - Init(string::Sprintf(args...), f, l); + EnforceNotMet(const std::string& str, const char* f, int l) { + Init(str, f, l); } const char* what() const noexcept override { return err_str_.c_str(); } @@ -142,28 +141,23 @@ struct EOFException : public std::exception { inline bool is_error(bool stat) { return !stat; } -template -inline typename std::enable_if::type throw_on_error( - bool stat, const Args&... args) { +inline void throw_on_error(bool stat, const std::string& msg) { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(string::Sprintf(args...)); + throw std::runtime_error(msg); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << msg; #endif } #ifdef PADDLE_WITH_CUDA -inline bool is_error(cudaError_t e) { return UNLIKELY(e); } +inline bool is_error(cudaError_t e) { return e != cudaSuccess; } -template -inline typename std::enable_if::type throw_on_error( - cudaError_t e, const Args&... args) { +inline void throw_on_error(cudaError_t e, const std::string& msg) { #ifndef REPLACE_ENFORCE_GLOG - throw thrust::system_error(e, thrust::cuda_category(), - string::Sprintf(args...)); + throw thrust::system_error(e, thrust::cuda_category(), msg); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << msg; #endif } @@ -171,14 +165,12 @@ inline bool is_error(curandStatus_t stat) { return stat != CURAND_STATUS_SUCCESS; } -template -inline typename std::enable_if::type throw_on_error( - curandStatus_t stat, const Args&... args) { +inline void throw_on_error(curandStatus_t stat, const std::string& msg) { #ifndef REPLACE_ENFORCE_GLOG throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), - string::Sprintf(args...)); + msg); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << msg; #endif } @@ -186,14 +178,11 @@ inline bool is_error(cudnnStatus_t stat) { return stat != CUDNN_STATUS_SUCCESS; } -template -inline typename std::enable_if::type throw_on_error( - cudnnStatus_t stat, const Args&... args) { +inline void throw_on_error(cudnnStatus_t stat, const std::string& msg) { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + - string::Sprintf(args...)); + throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + msg); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << platform::dynload::cudnnGetErrorString(stat) << msg; #endif } @@ -201,9 +190,7 @@ inline bool is_error(cublasStatus_t stat) { return stat != CUBLAS_STATUS_SUCCESS; } -template -inline typename std::enable_if::type throw_on_error( - cublasStatus_t stat, const Args&... 
args) { +inline void throw_on_error(cublasStatus_t stat, const std::string& msg) { std::string err; if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { err = "CUBLAS: not initialized, "; @@ -225,87 +212,45 @@ inline typename std::enable_if::type throw_on_error( err = "CUBLAS: license error, "; } #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(err + string::Sprintf(args...)); + throw std::runtime_error(err + msg); #else - LOG(FATAL) << err << string::Sprintf(args...); + LOG(FATAL) << err << msg; #endif } #if !defined(__APPLE__) && !defined(_WIN32) -template -inline typename std::enable_if::type throw_on_error( - ncclResult_t stat, const Args&... args) { - if (stat == ncclSuccess) { - return; - } else { +inline bool is_error(ncclResult_t nccl_result) { + return nccl_result != ncclSuccess; +} + +inline void throw_on_error(ncclResult_t stat, const std::string& msg) { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) + - string::Sprintf(args...)); + throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) + msg); #else - LOG(FATAL) << platform::dynload::ncclGetErrorString(stat) - << string::Sprintf(args...); + LOG(FATAL) << platform::dynload::ncclGetErrorString(stat) << msg; #endif - } } #endif // __APPLE__ and windows #endif // PADDLE_WITH_CUDA -template -inline void throw_on_error(T e) { - throw_on_error(e, ""); -} - -#define PADDLE_THROW(...) \ - throw ::paddle::platform::EnforceNotMet(__FILE__, __LINE__, __VA_ARGS__) - -#define __PADDLE_THROW_ERROR_I(_, _9, _8, _7, _6, _5, _4, _3, _2, X_, ...) X_; - -#define __THROW_ON_ERROR_ONE_ARG(COND, ARG) \ - ::paddle::platform::throw_on_error(COND, ::paddle::string::Sprintf(ARG)); - -#ifdef _WIN32 -#define __PADDLE_THROW_ON_ERROR(COND, ...) \ - __THROW_ON_ERROR_ONE_ARG(COND, __VA_ARGS__) -#else // _WIN32 -#define __PADDLE_THROW_ON_ERROR(COND, ...) \ - __PADDLE_THROW_ERROR_I( \ - __VA_ARGS__, ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - __THROW_ON_ERROR_ONE_ARG(COND, __VA_ARGS__)) -#endif // _WIN32 - -#define __PADDLE_UNARY_COMPARE(COND, ...) \ - do { \ - auto __cond = COND; \ - if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ - __PADDLE_THROW_ON_ERROR(__cond, __VA_ARGS__); \ - } \ +#define PADDLE_THROW(...) \ + throw ::paddle::platform::EnforceNotMet( \ + ::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__) + +#define PADDLE_ENFORCE(COND, ...) \ + do { \ + auto __cond__ = (COND); \ + if (UNLIKELY(::paddle::platform::is_error(__cond__))) { \ + try { \ + ::paddle::platform::throw_on_error( \ + __cond__, ::paddle::string::Sprintf(__VA_ARGS__)); \ + } catch (...) { \ + throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ + __FILE__, __LINE__); \ + } \ + } \ } while (0) -#ifndef REPLACE_ENFORCE_GLOG -#define __PADDLE_ENFORCE_I(COND, ...) \ - do { \ - try { \ - __PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); \ - } catch (...) { \ - throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ - __FILE__, __LINE__); \ - } \ - } while (0) - -#else -#define __PADDLE_ENFORCE_I(COND, ...) 
__PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); -#endif // REPLACE_ENFORCE_GLOG - -#define __PADDLE_ENFORCE(__args) __PADDLE_ENFORCE_I __args -#define PADDLE_ENFORCE(...) __PADDLE_ENFORCE((__VA_ARGS__)) - #define PADDLE_THROW_EOF() \ do { \ throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 8df8e32098697540f02d488c873f5ae7fb29828e..6ae21ee8294bedc388f837aad3e20a2b9aca98a2 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -64,7 +64,7 @@ class NCCLGroupGuard { } inline ~NCCLGroupGuard() { - CHECK_EQ(dynload::ncclGroupEnd(), ncclSuccess); + PADDLE_ENFORCE(dynload::ncclGroupEnd()); NCCLMutex().unlock(); } }; diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 26247026667158a2f43cdac21bf5600479455e16..e05667d2c7e9ce5c64cfacee4919cd36d7383c0c 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -180,8 +180,14 @@ void BindNativePredictor(py::module *m) { } void BindAnalysisConfig(py::module *m) { - py::class_(*m, "AnalysisConfig") - .def(py::init()) + py::class_ analysis_config(*m, "AnalysisConfig"); + + py::enum_(analysis_config, "Precision") + .value("Float32", AnalysisConfig::Precision::kFloat32) + .value("Int8", AnalysisConfig::Precision::kInt8) + .export_values(); + + analysis_config.def(py::init()) .def(py::init()) .def(py::init()) .def("set_model", (void (AnalysisConfig::*)(const std::string &)) & @@ -215,7 +221,8 @@ void BindAnalysisConfig(py::module *m) { .def("specify_input_name", &AnalysisConfig::specify_input_name) .def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine, py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1, - py::arg("min_subgraph_size") = 3) + py::arg("min_subgraph_size") = 3, + py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32) .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled) .def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug, py::arg("x") = true) diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h index 0b94b60018aac3a61edfda4d7ecb762e9fe70673..16bb3771f2e9bcc07028ef2039fed8691f9aab97 100644 --- a/paddle/fluid/string/printf.h +++ b/paddle/fluid/string/printf.h @@ -84,6 +84,8 @@ void Fprintf(std::ostream& out, const char* fmt, const Args&... args) { tinyformat::vformat(out, fmt, tinyformat::makeFormatList(args...)); } +inline std::string Sprintf() { return ""; } + template std::string Sprintf(const Args&... args) { std::ostringstream oss; diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index c2156a436ec73d03082fa08b6250dc77b2cee19f..1135caf4f8c32901d93270d372fdaac702acf006 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -173,7 +173,6 @@ function cmake_gen() { -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} ${PYTHON_FLAGS} -DWITH_DSO=ON - -DWITH_DOC=${WITH_DOC:-OFF} -DWITH_GPU=${WITH_GPU:-OFF} -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} -DWITH_DISTRIBUTE=${distibuted_flag} @@ -208,7 +207,6 @@ EOF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \ ${PYTHON_FLAGS} \ -DWITH_DSO=ON \ - -DWITH_DOC=${WITH_DOC:-OFF} \ -DWITH_GPU=${WITH_GPU:-OFF} \ -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \ -DWITH_DISTRIBUTE=${distibuted_flag} \ @@ -528,31 +526,6 @@ function bind_test() { wait } - -function gen_docs() { - mkdir -p ${PADDLE_ROOT}/build - cd ${PADDLE_ROOT}/build - cat <
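A short usage note on the enforce.h simplification above: PADDLE_ENFORCE now formats its message once with string::Sprintf and passes a single std::string to the non-template throw_on_error overloads, and the new zero-argument Sprintf() in string/printf.h keeps the message-less form working. The call sites below are illustrative only; the names and messages are made up.

```cpp
// Illustrative call sites for the simplified enforce macros.
#include "paddle/fluid/platform/enforce.h"

void CheckBatch(int batch_size, int max_batch_size) {
  // Condition plus printf-style message: the macro formats the message once
  // with string::Sprintf and hands a single std::string to throw_on_error.
  PADDLE_ENFORCE(batch_size <= max_batch_size,
                 "batch_size %d exceeds max_batch_size %d", batch_size,
                 max_batch_size);

  // Message-less form: this relies on the zero-argument Sprintf() overload
  // added to string/printf.h above.
  PADDLE_ENFORCE(max_batch_size > 0);

  // PADDLE_THROW builds the string first, then constructs EnforceNotMet
  // with (message, file, line).
  if (batch_size < 0) {
    PADDLE_THROW("negative batch_size %d is not allowed", batch_size);
  }
}
```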