From 6fbd224e69cda5cf0f1ca7f12e9a9ec9dc26fa0f Mon Sep 17 00:00:00 2001
From: Zhaolong Xing
Date: Mon, 26 Aug 2019 10:44:21 +0800
Subject: [PATCH] CHERRY PICK FROM 18941, 18860, 19213: Fix Mask RCNN bug AND
 Paddle-TRT fp16 support (#19378)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* CHERRY_PICK 18941, 18860: TRT fp16 support.
test=release/1.5

* CHERRY_PICK 19213: Fix bug: Mask RCNN inference diff when using
AnalysisPredictor.
1. fix affine channel fuse pass.
2. fix condition block op.
3. fix merge lod tensor op bug.
4. fix memory optim bug caused by reset lod op.
test=release/1.5
---
 paddle/fluid/framework/ir/CMakeLists.txt      |  1 -
 .../framework/ir/infer_clean_graph_pass.cc    | 67 -------------------
 paddle/fluid/inference/analysis/analyzer.cc   |  3 +
 .../inference/analysis/analyzer_tester.cc     |  4 +-
 paddle/fluid/inference/analysis/argument.h    |  4 ++
 .../inference/analysis/ir_pass_manager.cc     |  6 +-
 .../ir_passes/tensorrt_subgraph_pass.cc       |  6 +-
 .../ir_passes/tensorrt_subgraph_pass.h        |  1 +
 .../inference/analysis/passes/CMakeLists.txt  |  2 +
 .../passes/inference_op_replace_pass.cc       |  2 +-
 .../analysis/passes/ir_graph_clean_pass.cc    | 47 +++++++++++++
 .../analysis/passes/ir_graph_clean_pass.h     | 34 ++++++++++
 .../analysis/passes/memory_optimize_pass.cc   | 10 ++-
 .../analysis/passes/memory_optimize_pass.h    |  1 +
 .../fluid/inference/analysis/passes/passes.cc |  3 +
 .../fluid/inference/api/analysis_predictor.cc | 19 ++----
 .../fluid/inference/api/analysis_predictor.h  |  2 -
 .../api/analysis_predictor_tester.cc          |  3 -
 .../fluid/inference/api/mkldnn_quantizer.cc   |  5 +-
 .../inference/api/paddle_analysis_config.h    |  1 +
 .../inference/api/paddle_pass_builder.cc      | 12 ++--
 .../fluid/inference/api/paddle_pass_builder.h |  2 +-
 .../tensorrt/convert/test_op_converter.cc     |  6 +-
 .../inference/tensorrt/convert/ut_helper.h    |  3 +-
 paddle/fluid/inference/tensorrt/engine.cc     | 22 +++++-
 paddle/fluid/inference/tensorrt/engine.h      | 25 ++++---
 paddle/fluid/operators/merge_lod_tensor_op.cc | 37 +++++++++-
 .../operators/tensorrt/tensorrt_engine_op.h   | 14 +++-
 .../tensorrt/tensorrt_engine_op_test.cc       |  2 +
 paddle/fluid/pybind/inference_api.cc          |  1 +
 .../test_split_and_merge_lod_tensor_op.py     | 65 ++++++++++++++----
 31 files changed, 268 insertions(+), 142 deletions(-)
 delete mode 100644 paddle/fluid/framework/ir/infer_clean_graph_pass.cc
 create mode 100644 paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc
 create mode 100644 paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h

diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index d50d841379..0699e2d6da 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -52,7 +52,6 @@ pass_library(graph_viz_pass base)
 pass_library(lock_free_optimize_pass base)
 pass_library(fc_fuse_pass inference)
 pass_library(attention_lstm_fuse_pass inference)
-pass_library(infer_clean_graph_pass inference)
 pass_library(fc_lstm_fuse_pass inference)
 pass_library(embedding_fc_lstm_fuse_pass inference)
 pass_library(fc_gru_fuse_pass inference)
diff --git a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
deleted file mode 100644
index d76924116f..0000000000
--- a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" - -namespace paddle { -namespace framework { -namespace ir { - -class InferCleanGraphPass : public FusePassBase { - public: - virtual ~InferCleanGraphPass() {} - - protected: - void ApplyImpl(ir::Graph* graph) const { - FusePassBase::Init("original_graph", graph); - PADDLE_ENFORCE(graph); - - auto is_valid_node = [](Node* x) { - return x && IsControlDepVar(*x) && x->IsVar() && !x->Var(); - }; - - std::unordered_set invalid_nodes; - int valid_op = 0; - for (auto* node : graph->Nodes()) { - PADDLE_ENFORCE_NOT_NULL(node); - if (is_valid_node(node)) { - invalid_nodes.insert(node); - } else if (node->IsOp()) { - // Collect all the operators to help tracking number of operators. - ++valid_op; - } - } - - GraphSafeRemoveNodes(graph, invalid_nodes); - - AddStatis(valid_op); - } - - void CleanEdges(std::vector* nodes, - const std::unordered_set& to_remove) const { - auto it = std::remove_if(nodes->begin(), nodes->end(), - [&](Node* x) { return to_remove.count(x); }); - nodes->erase(it, nodes->end()); - } -}; - -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_PASS(infer_clean_graph_pass, - paddle::framework::ir::InferCleanGraphPass); diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index d82a063d88..71fdb5570c 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -31,6 +31,9 @@ void Analyzer::RunAnalysis(Argument *argument) { "analsis_passes is not valid in the argument."); for (auto &pass : argument->analysis_passes()) { string::PrettyLogH1("--- Running analysis [%s]", pass); + if (!argument->enable_analysis_optim() && pass == "ir_analysis_pass") + continue; + auto *ptr = PassRegistry::Global().Retreive(pass); PADDLE_ENFORCE_NOT_NULL(ptr, "no analysis pass called %s", pass); ptr->Run(argument); diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index c814ce4548..489345da49 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -30,7 +30,7 @@ using namespace framework; // NOLINT TEST(Analyzer, analysis_without_tensorrt) { Argument argument; argument.SetModelDir(FLAGS_inference_model_dir); - argument.SetIrAnalysisPasses({"infer_clean_graph_pass"}); + argument.SetEnableAnalysisOptim(false); argument.SetUseGPU(false); argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass", "ir_params_sync_among_devices_pass"}); @@ -41,10 +41,10 @@ TEST(Analyzer, analysis_without_tensorrt) { TEST(Analyzer, analysis_with_tensorrt) { Argument argument; + argument.SetEnableAnalysisOptim(false); argument.SetTensorRtMaxBatchSize(3); argument.SetTensorRtWorkspaceSize(1 << 20); 
argument.SetModelDir(FLAGS_inference_model_dir); - argument.SetIrAnalysisPasses({"infer_clean_graph_pass"}); argument.SetUseGPU(false); argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass", "ir_params_sync_among_devices_pass"}); diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index e468bc2267..53db72e092 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -62,6 +62,9 @@ struct Argument { using anakin_max_shape_t = std::map>; bool Has(const std::string& key) const { return valid_fields_.count(key); } + // If we set the model using config.SetModelBuffer, + // the model and parameter will occupy additional CPU resources. + // Use this interface to release these resources. void PartiallyRelease() { if (Has("model_program_path")) { if (Has("model_from_memory") && model_from_memory()) { @@ -130,6 +133,7 @@ struct Argument { DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string); DECL_ARGUMENT_FIELD(model_from_memory, ModelFromMemory, bool); DECL_ARGUMENT_FIELD(optim_cache_dir, OptimCacheDir, std::string); + DECL_ARGUMENT_FIELD(enable_analysis_optim, EnableAnalysisOptim, bool); // The overall graph to work on. DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 2dae513719..2b7f1dfeaf 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -84,13 +84,15 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("program", new framework::ProgramDesc *(&argument->main_program())); - bool enable_int8 = argument->tensorrt_precision_mode() == - AnalysisConfig::Precision::kInt8; + auto precision_mode = argument->tensorrt_precision_mode(); + bool enable_int8 = precision_mode == AnalysisConfig::Precision::kInt8; pass->Set("predictor_id", new int(argument->predictor_id())); bool use_calib_mode = argument->tensorrt_use_calib_mode(); pass->Set("enable_int8", new bool(enable_int8)); pass->Set("use_calib_mode", new bool(use_calib_mode)); + pass->Set("precision_mode", + new AnalysisConfig::Precision(precision_mode)); bool use_static_engine = argument->tensorrt_use_static_engine(); bool model_from_memory = argument->model_from_memory(); diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index ce8f57c0f0..1d1e4570e1 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -149,6 +149,9 @@ void TensorRtSubgraphPass::CreateTensorRTOp( graph_var_map[node->Name()] = node; } } + auto precision_mode = Get("precision_mode"); + bool enable_fp16 = false; + if (precision_mode == AnalysisConfig::Precision::kHalf) enable_fp16 = true; auto enable_int8 = Get("enable_int8"); auto use_calib_mode = Get("use_calib_mode"); auto &subgraph_nodes = *Agent(node).subgraph(); @@ -216,6 +219,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( SetAttr(op_desc->Proto(), "calibration_data", calibration_data); SetAttr(op_desc->Proto(), "enable_int8", enable_int8); + SetAttr(op_desc->Proto(), "enable_fp16", enable_fp16); SetAttr(op_desc->Proto(), "use_calib_mode", use_calib_mode); SetAttr(op_desc->Proto(), "engine_key", engine_key); SetAttr(op_desc->Proto(), "predictor_id", predictor_id); @@ 
-244,7 +248,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( inference::Singleton::Global() .Create(engine_key + std::to_string(predictor_id), Get("max_batch_size"), Get("workspace_size"), - enable_int8, calibrator.get(), Get("gpu_device_id")); + precision_mode, calibrator.get(), Get("gpu_device_id")); bool need_serialize = (use_static_engine && !load_from_memory); if (need_serialize) { diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h index f530a5a0b3..b6b67ce8ec 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h @@ -21,6 +21,7 @@ #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" +#include "paddle/fluid/inference/api/paddle_analysis_config.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt index 860dc30976..1c878d66ba 100644 --- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -5,6 +5,7 @@ cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_p cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass) cc_library(adjust_cudnn_workspace_size_pass SRCS adjust_cudnn_workspace_size_pass.cc DEPS analysis_pass graph_to_program_pass) cc_library(inference_op_replace_pass SRCS inference_op_replace_pass.cc DEPS analysis_pass graph_to_program_pass) +cc_library(ir_graph_clean_pass SRCS ir_graph_clean_pass.cc DEPS analysis_pass) cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass @@ -14,6 +15,7 @@ cc_library(analysis_passes SRCS passes.cc DEPS memory_optim_pass inference_op_replace_pass ir_graph_to_program_pass + ir_graph_clean_pass ) set(analysis_deps ${analysis_deps} diff --git a/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc b/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc index ef7d13da89..86ced982d3 100644 --- a/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc +++ b/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc @@ -20,9 +20,9 @@ namespace inference { namespace analysis { void InferenceOpReplacePass::RunImpl(Argument* argument) { - if (!argument->use_gpu()) return; std::unordered_map replaced_map{ {"conditional_block", "conditional_block_infer"}, + {"merge_lod_tensor", "merge_lod_tensor_infer"}, }; auto& graph = argument->main_graph(); diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc new file mode 100644 index 0000000000..1f888a28da --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h" +#include +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/node.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void IrInferCleanGraphPass::RunImpl(Argument* argument) { + auto& graph = argument->main_graph(); + auto is_valid_node = [](framework::ir::Node* x) { + return x && IsControlDepVar(*x) && x->IsVar() && !x->Var(); + }; + + std::unordered_set invalid_nodes; + int valid_op = 0; + for (auto* node : graph.Nodes()) { + PADDLE_ENFORCE_NOT_NULL(node); + if (is_valid_node(node)) { + invalid_nodes.insert(node); + } else if (node->IsOp()) { + ++valid_op; + } + } + + GraphSafeRemoveNodes(&graph, invalid_nodes); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h new file mode 100644 index 0000000000..a9d58aa2f4 --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include "paddle/fluid/inference/analysis/analysis_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +class IrInferCleanGraphPass : public AnalysisPass { + public: + void RunImpl(Argument *argument) override; + + std::string repr() const override { return "ir_graph_clean_pass"; } +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index c894acfd48..6ecaf08f7d 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -109,10 +109,16 @@ int DataTypeToSpace(framework::proto::VarType_Type type) { void MemoryOptimizePass::CollectVarMemorySize( space_table_t* space_table) const { const int fake_batch_size = 1; + auto valid_var = [&](framework::ir::Node* node) -> bool { - std::set invalid_op = {"while", "conditional_block", + std::set invalid_op = {"while", + "conditional_block", "tensorrt_engine", - "conditional_block_infer"}; + "conditional_block_infer", + "merge_lod_tensor_infer", + "merge_lod_tensor", + "equal", + "lod_reset"}; for (auto* tmp : node->inputs) { CHECK(tmp->IsOp()); std::string op_type = tmp->Op()->Type(); diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h index 5a907303b4..90e285da09 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h @@ -75,6 +75,7 @@ class MemoryOptimizePass : public AnalysisPass { int sort_kind) const; void CollectVarMemorySize(space_table_t *space_table) const; + void CollectVarMemorySize0(space_table_t *space_table) const; void CollectVarMemorySize( const std::unordered_map &batch_var_ave_dim, diff --git a/paddle/fluid/inference/analysis/passes/passes.cc b/paddle/fluid/inference/analysis/passes/passes.cc index 97debcec56..ca0b25c29d 100644 --- a/paddle/fluid/inference/analysis/passes/passes.cc +++ b/paddle/fluid/inference/analysis/passes/passes.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h" +#include "paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h" #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" @@ -32,6 +33,8 @@ PassRegistry::PassRegistry() { std::unique_ptr(new IrAnalysisPass)); passes_.emplace("ir_graph_build_pass", std::unique_ptr(new IrGraphBuildPass)); + passes_.emplace("ir_graph_clean_pass", + std::unique_ptr(new IrInferCleanGraphPass)); passes_.emplace("memory_optimize_pass", std::unique_ptr(new MemoryOptimizePass)); passes_.emplace( diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index e7a8549d3b..0ae6654476 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -135,7 +135,6 @@ bool AnalysisPredictor::PrepareProgram( const std::shared_ptr &program) { if (!program) { if (!LoadProgramDesc()) return false; - // If not cloned, the parameters 
should be loaded. // If config_.ir_optim() is True, parameters is loaded in // OptimizeInferenceProgram(), but other persistable variables @@ -145,17 +144,10 @@ bool AnalysisPredictor::PrepareProgram( // So in both case, create persistable variables at first. executor_->CreateVariables(*inference_program_, 0, true, sub_scope_); - // Optimize the program, and load parameters and modify them in the - // scope_. - // This will change the scope_ address. - if (config_.ir_optim()) { - status_ir_optim_enabled_ = true; - OptimizeInferenceProgram(); - } else { - // Load parameters - LOG(INFO) << "load parameters "; - LoadParameters(); - } + // if enable_ir_optim_ is false, + // the analysis pass(op fuse, graph analysis, trt subgraph, mkldnn etc) will + // not be executed. + OptimizeInferenceProgram(); } else { // If the program is passed from external, no need to optimize it, this // logic is used in the clone scenario. @@ -363,6 +355,7 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, void AnalysisPredictor::PrepareArgument() { argument_.SetUseGPU(config_.use_gpu()); argument_.SetGPUDeviceId(config_.gpu_device_id()); + argument_.SetEnableAnalysisOptim(config_.enable_ir_optim_); argument_.SetEnableMemoryOptim(config_.enable_memory_optim()); argument_.SetStaticMemoryOptim(config_.static_memory_optim_); argument_.SetStaticMemoryOptimForceUpdate( @@ -434,8 +427,6 @@ void AnalysisPredictor::PrepareArgument() { // NOTE All the members in AnalysisConfig should be copied to Argument. void AnalysisPredictor::OptimizeInferenceProgram() { - status_program_optimized_ = true; - PrepareArgument(); Analyzer().Run(&argument_); diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index b5e134ced7..5deadcbe3e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -175,10 +175,8 @@ class AnalysisPredictor : public PaddlePredictor { private: // Some status here that help to determine the status inside the predictor. - bool status_program_optimized_{false}; bool status_is_cloned_{false}; bool status_use_gpu_{false}; - bool status_ir_optim_enabled_{false}; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 44b1b8071d..e990b2c773 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -44,7 +44,6 @@ TEST(AnalysisPredictor, analysis_off) { ASSERT_EQ(predictor->scope_->parent(), nullptr); ASSERT_EQ(predictor->sub_scope_->parent(), predictor->scope_.get()); // ir is turned off, so program shouldn't be optimized. - ASSERT_FALSE(predictor->status_program_optimized_); LOG(INFO) << "scope parameters " << predictor->scope_->LocalVarNames().size(); // 2. Dummy Input Data @@ -76,8 +75,6 @@ TEST(AnalysisPredictor, analysis_on) { ASSERT_TRUE(predictor->sub_scope_); ASSERT_EQ(predictor->scope_->parent(), nullptr); ASSERT_EQ(predictor->sub_scope_->parent(), predictor->scope_.get()); - // ir is turned on, so program should be optimized. - ASSERT_TRUE(predictor->status_program_optimized_); // 2. 
Dummy Input Data int64_t data[4] = {1, 2, 3, 4}; PaddleTensor tensor; diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc index 9d560ddd2e..f9a4b460e2 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -367,13 +367,14 @@ void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const { auto* builder = predictor_.config_.pass_builder(); builder->SetPasses({ - "infer_clean_graph_pass", "cpu_quantize_pass", "cpu_quantize_squash_pass", + "cpu_quantize_pass", "cpu_quantize_squash_pass", }); if (predictor_.config_.ir_debug_) builder->TurnOnDebug(); auto passes = builder->AllPasses(); predictor_.argument_.SetIrAnalysisPasses(passes); predictor_.argument_.SetAnalysisPasses( - {"ir_analysis_pass", "memory_optimize_pass", "ir_graph_to_program_pass"}); + {"ir_graph_clean_pass", "ir_analysis_pass", "memory_optimize_pass", + "ir_graph_to_program_pass"}); predictor_.argument_.SetQuantVarScales(scales_); } diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index e94ca5e962..0fd9e1ff6a 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -46,6 +46,7 @@ struct AnalysisConfig { enum class Precision { kFloat32 = 0, kInt8, + kHalf, }; /** Set model with a directory. diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 7d5a77b4b8..6297adc86c 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -71,8 +71,7 @@ void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) { void PaddlePassBuilder::ClearPasses() { passes_.clear(); } const std::vector kTRTSubgraphPasses({ - "infer_clean_graph_pass", // - "conv_affine_channel_fuse_pass", // + "conv_affine_channel_fuse_pass", // "conv_eltwiseadd_affine_channel_fuse_pass", // "quant_conv2d_dequant_fuse_pass", // "delete_quant_dequant_op_pass", // @@ -90,7 +89,6 @@ const std::vector kTRTSubgraphPasses({ // The following passes works for Anakin sub-graph engine. const std::vector kAnakinSubgraphPasses({ - "infer_clean_graph_pass", // "quant_conv2d_dequant_fuse_pass", // "simplify_anakin_priorbox_detection_out_pass", // "fillconstant_elementwisemul_fuse", // @@ -104,9 +102,8 @@ const std::vector kAnakinSubgraphPasses({ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { passes_.assign({ - "infer_clean_graph_pass", // - // "identity_scale_op_clean_pass", // - "conv_affine_channel_fuse_pass", // + // "identity_scale_op_clean_pass", // + "conv_affine_channel_fuse_pass", // "conv_eltwiseadd_affine_channel_fuse_pass", // "conv_bn_fuse_pass", // "conv_eltwiseadd_bn_fuse_pass", // @@ -140,8 +137,7 @@ void GpuPassStrategy::EnableNgraph() { CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { // NOTE the large fusions should be located in the front, so that they will // not be damaged by smaller ones. 
- passes_.assign({"infer_clean_graph_pass", // - "attention_lstm_fuse_pass", // + passes_.assign({"attention_lstm_fuse_pass", // "seqconv_eltadd_relu_fuse_pass", // // "seqpool_concat_fuse_pass", // "seqpool_cvm_concat_fuse_pass", // diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 62b7ab3045..6aa59e0950 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -72,7 +72,7 @@ class PaddlePassBuilder { protected: std::vector analysis_passes_{ - {"ir_graph_build_pass", "ir_analysis_pass", + {"ir_graph_build_pass", "ir_graph_clean_pass", "ir_analysis_pass", "ir_params_sync_among_devices_pass", "adjust_cudnn_workspace_size_pass", "inference_op_replace_pass"}}; std::vector passes_; diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc index c5a413221e..5265566370 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -#include +#include // NOLINT #include "paddle/fluid/framework/program_desc.h" namespace paddle { @@ -27,10 +27,8 @@ TEST(OpConverter, ConvertBlock) { auto* conv2d_op = block->AppendOp(); // init trt engine - cudaStream_t stream_; std::unique_ptr engine_; - PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); - engine_.reset(new TensorRTEngine(5, 1 << 15, stream_)); + engine_.reset(new TensorRTEngine(5, 1 << 15)); engine_->InitNetwork(); engine_->DeclareInput("conv2d-X", nvinfer1::DataType::kFLOAT, diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 388d83d834..97affafb4b 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -80,8 +80,7 @@ class TRTConvertValidation { if_add_batch_(if_add_batch), max_batch_size_(max_batch_size) { PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); - engine_.reset( - new TensorRTEngine(max_batch_size, workspace_size, false, nullptr, 0)); + engine_.reset(new TensorRTEngine(max_batch_size, workspace_size)); engine_->InitNetwork(); } diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index c5ac6f3841..196cf2f89f 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -51,7 +51,27 @@ void TensorRTEngine::FreezeNetwork() { // build engine. infer_builder_->setMaxBatchSize(max_batch_); infer_builder_->setMaxWorkspaceSize(max_workspace_); - if (enable_int8_) { + bool enable_fp16 = (precision_ == AnalysisConfig::Precision::kHalf); +#if IS_TRT_VERSION_GE(5000) + if (enable_fp16) { + bool support_fp16 = infer_builder_->platformHasFastFp16(); + infer_builder_->setFp16Mode(support_fp16); + if (!support_fp16) { + LOG(INFO) << "You specify FP16 mode, but the hardware do not support " + "FP16 speed up, use FP32 instead."; + } else { + LOG(INFO) << "Run Paddle-TRT FP16 mode. "; + } + } +#else + if (enable_fp16) + LOG(INFO) << "Using FP16 in Paddle-trt must ensure that the version of TRT " + "is at least 5." 
+ " So, use FP32 to run."; +#endif + bool enable_int8 = (precision_ == AnalysisConfig::Precision::kInt8); + + if (enable_int8) { infer_builder_->setInt8Mode(true); if (calibrator_) { infer_builder_->setInt8Calibrator(calibrator_); diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index d0a759d850..012c9fbb23 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -22,6 +22,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" @@ -61,12 +62,14 @@ class TensorRTEngine { nvinfer1::Weights w_; }; - TensorRTEngine(int max_batch, int max_workspace, bool enable_int8 = false, - TRTInt8Calibrator* calibrator = nullptr, int device_id = 0, - nvinfer1::ILogger& logger = NaiveLogger::Global()) + TensorRTEngine( + int max_batch, int max_workspace, + AnalysisConfig::Precision precision = AnalysisConfig::Precision::kFloat32, + TRTInt8Calibrator* calibrator = nullptr, int device_id = 0, + nvinfer1::ILogger& logger = NaiveLogger::Global()) : max_batch_(max_batch), max_workspace_(max_workspace), - enable_int8_(enable_int8), + precision_(precision), calibrator_(calibrator), device_id_(device_id), logger_(logger) {} @@ -168,7 +171,7 @@ class TensorRTEngine { // the max memory size the engine uses int max_workspace_; - bool enable_int8_; + AnalysisConfig::Precision precision_; TRTInt8Calibrator* calibrator_; // batch size of the current data, will be updated each Executation. 
int batch_size_{-1}; @@ -231,12 +234,12 @@ class TRTEngineManager { return engines_.at(name).get(); } - TensorRTEngine* Create(std::string name, int max_batch, int max_workspace, - bool enable_int8 = false, - TRTInt8Calibrator* calibrator = nullptr, - int device_id = 0, - nvinfer1::ILogger& logger = NaiveLogger::Global()) { - auto* p = new TensorRTEngine(max_batch, max_workspace, enable_int8, + TensorRTEngine* Create( + std::string name, int max_batch, int max_workspace, + AnalysisConfig::Precision precision = AnalysisConfig::Precision::kFloat32, + TRTInt8Calibrator* calibrator = nullptr, int device_id = 0, + nvinfer1::ILogger& logger = NaiveLogger::Global()) { + auto* p = new TensorRTEngine(max_batch, max_workspace, precision, calibrator, device_id, logger); engines_[name].reset(p); return p; diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc index 5edc233f6f..6a9d8222c4 100644 --- a/paddle/fluid/operators/merge_lod_tensor_op.cc +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -28,9 +28,9 @@ class MergeLoDTensorOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { + protected: + void RunBase(const framework::Scope &scope, + const platform::Place &dev_place) const { // get device context from pool platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(dev_place); @@ -125,6 +125,33 @@ class MergeLoDTensorOp : public framework::OperatorBase { out_lod->insert(out_lod->begin(), x.lod()[i]); } } + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + RunBase(scope, dev_place); + } +}; + +class MergeLoDTensorInferOp : public MergeLoDTensorOp { + public: + MergeLoDTensorInferOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : MergeLoDTensorOp(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + RunBase(scope, dev_place); + framework::Variable *in_true_var = scope.FindVar(Input("InTrue")); + framework::Variable *in_false_var = scope.FindVar(Input("InFalse")); + in_true_var->Clear(); + in_false_var->Clear(); + in_true_var->GetMutable(); + in_false_var->GetMutable(); + } }; class MergeLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker { @@ -196,3 +223,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(merge_lod_tensor, ops::MergeLoDTensorOp, ops::MergeLoDTensorOpProtoMaker, ops::MergeLoDTensorInferShape, ops::MergeLoDTensorGradMaker); +REGISTER_OPERATOR(merge_lod_tensor_infer, ops::MergeLoDTensorInferOp, + ops::MergeLoDTensorOpProtoMaker, + ops::MergeLoDTensorInferShape, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 79c9f759aa..41492979cd 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -48,12 +48,14 @@ class TensorRTEngineOp : public framework::OperatorBase { int workspace_size_; std::unique_ptr calibrator_; bool enable_int8_; + bool enable_fp16_; bool use_calib_mode_; std::string calibration_data_; std::string engine_key_; bool 
calibration_mode_; int predictor_id_; int device_id_; + AnalysisConfig::Precision precision_mode_; public: TensorRTEngineOp(const std::string &type, @@ -66,6 +68,7 @@ class TensorRTEngineOp : public framework::OperatorBase { workspace_size_ = Attr("workspace_size"); device_id_ = Attr("gpu_id"); enable_int8_ = Attr("enable_int8"); + enable_fp16_ = Attr("enable_fp16"); use_calib_mode_ = Attr("use_calib_mode"); calibration_data_ = Attr("calibration_data"); engine_key_ = Attr("engine_key"); @@ -93,6 +96,13 @@ class TensorRTEngineOp : public framework::OperatorBase { inference::Singleton::Global() .Get(engine_key_ + std::to_string(predictor_id_)); } + precision_mode_ = AnalysisConfig::Precision::kFloat32; + if (enable_int8_) { + precision_mode_ = AnalysisConfig::Precision::kInt8; + } + if (enable_fp16_) { + precision_mode_ = AnalysisConfig::Precision::kHalf; + } } protected: @@ -141,7 +151,7 @@ class TensorRTEngineOp : public framework::OperatorBase { calib_buffers, runtime_batch, engine_key_, dev_place)); calib_res->thr_.reset(new std::thread([&]() { calib_res->engine_.reset(new TensorRTEngine( - max_batch_size_, workspace_size_, enable_int8_, + max_batch_size_, workspace_size_, precision_mode_, calib_res->calib_.get(), boost::get(dev_place).device)); VLOG(3) << "start the calib trt engine thread"; @@ -241,7 +251,7 @@ class TensorRTEngineOp : public framework::OperatorBase { trt_engine_ = inference::Singleton::Global() .Create(engine_key_ + std::to_string(predictor_id_), - max_batch_size_, workspace_size_, enable_int8_, + max_batch_size_, workspace_size_, precision_mode_, calibrator_.get(), device_id_); PrepareTRTEngine(scope, trt_engine_); } diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index efc50fc06f..e813e9ca75 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -105,6 +105,7 @@ TEST(TensorRTEngineOp, manual) { engine_op_desc.SetAttr("predictor_id", 1); engine_op_desc.SetAttr("calibration_data", std::string("")); engine_op_desc.SetAttr("enable_int8", static_cast(false)); + engine_op_desc.SetAttr("enable_fp16", static_cast(false)); engine_op_desc.SetAttr("use_calib_mode", static_cast(false)); engine_op_desc.SetAttr("output_name_mapping", std::vector({"z0"})); @@ -205,6 +206,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { engine_op_desc.SetAttr("predictor_id", 1); engine_op_desc.SetAttr("calibration_data", std::string("")); engine_op_desc.SetAttr("enable_int8", static_cast(false)); + engine_op_desc.SetAttr("enable_fp16", static_cast(false)); engine_op_desc.SetAttr("use_calib_mode", static_cast(false)); engine_op_desc.SetAttr("output_name_mapping", std::vector({"z3"})); diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index d8664425b7..ae7fcad784 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -199,6 +199,7 @@ void BindAnalysisConfig(py::module *m) { py::enum_(analysis_config, "Precision") .value("Float32", AnalysisConfig::Precision::kFloat32) .value("Int8", AnalysisConfig::Precision::kInt8) + .value("Half", AnalysisConfig::Precision::kHalf) .export_values(); analysis_config.def(py::init()) diff --git a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py index 5397d5c521..f407eb1d8b 100644 --- 
a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py @@ -23,6 +23,7 @@ from paddle.fluid.executor import Executor from paddle.fluid.backward import append_backward from paddle.fluid.layers.control_flow import split_lod_tensor from paddle.fluid.layers.control_flow import merge_lod_tensor +from paddle.fluid.layer_helper import LayerHelper class TestCPULoDTensorArrayOps(unittest.TestCase): @@ -57,7 +58,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): expect_false=expect_false, expect_out=tensor) - def test_split_and_merge_lod_tensor_level_0(self): + def split_and_merge_lod_tensor_level_0(self, use_merge_lod_infer=False): tensor = core.LoDTensor() tensor.set(np.arange(10).reshape(10, 1).astype('int32'), self.place()) tensor.set_recursive_sequence_lengths([[3, 6, 1]]) @@ -87,10 +88,23 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): mask=mask, expect_true=expect_true, expect_false=expect_false, - expect_out=tensor) - - def main(self, tensor, mask, expect_true, expect_false, expect_out, - level=0): + expect_out=tensor, + use_merge_lod_infer=use_merge_lod_infer) + + def test_split_and_merge_lod_tensor_1(self): + self.split_and_merge_lod_tensor_level_0() + + def test_split_and_merge_lod_tensor_2(self): + self.split_and_merge_lod_tensor_level_0(True) + + def main(self, + tensor, + mask, + expect_true, + expect_false, + expect_out, + level=0, + use_merge_lod_infer=False): place = self.place() program = Program() with program_guard(program): @@ -103,11 +117,36 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): out_true, out_false = split_lod_tensor(input=x, mask=y, level=level) out_true.persistable = True out_false.persistable = True - - out = merge_lod_tensor( - in_true=out_true, in_false=out_false, mask=y, x=x, level=level) - - out.persistable = True + if use_merge_lod_infer: + input_dict = { + 'X': x, + 'Mask': mask, + 'InTrue': out_true, + 'InFalse': out_false, + 'level': level + } + helper = LayerHelper('merge_lod_tensor_infer') + out = helper.create_variable_for_type_inference( + dtype=out_true.dtype) + helper.append_op( + type='merge_lod_tensor_infer', + inputs={ + 'X': x, + 'Mask': y, + 'InTrue': out_true, + 'InFalse': out_false + }, + outputs={'Out': out}, + attrs={'level': level}) + out.persistable = True + else: + out = merge_lod_tensor( + in_true=out_true, + in_false=out_false, + mask=y, + x=x, + level=level) + out.persistable = True exe = Executor(place) scope = core.Scope() @@ -122,9 +161,9 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): var_false = scope.find_var(out_false.name).get_tensor() var_out = scope.find_var(out.name).get_tensor() - - self.check_tensor_same(var_true, expect_true) - self.check_tensor_same(var_false, expect_false) + if not use_merge_lod_infer: + self.check_tensor_same(var_true, expect_true) + self.check_tensor_same(var_false, expect_false) self.check_tensor_same(var_out, expect_out) def check_tensor_same(self, actual, expect): -- GitLab
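
Usage sketch: the FP16 path added above is selected through
AnalysisConfig::Precision::kHalf (exposed to Python as
AnalysisConfig.Precision.Half) and requires TensorRT >= 5; on GPUs without
fast FP16 the engine logs a notice and falls back to FP32, as implemented in
TensorRTEngine::FreezeNetwork(). The C++ example below is a minimal sketch,
assuming the six-argument EnableTensorRtEngine overload on this branch; the
model directory, GPU memory pool size, and batch/subgraph sizes are
placeholder values.

    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    int main() {
      paddle::AnalysisConfig config;
      config.SetModel("./model_dir");            // placeholder model directory
      config.EnableUseGpu(100 /*pool MB*/, 0 /*device id*/);
      // workspace_size, max_batch_size, min_subgraph_size, precision,
      // use_static, use_calib_mode (calibration only applies to Int8)
      config.EnableTensorRtEngine(1 << 20, 1, 3,
                                  paddle::AnalysisConfig::Precision::kHalf,
                                  false, false);
      auto predictor = paddle::CreatePaddlePredictor(config);
      // Feed inputs and run as usual, e.g. via predictor->Run(...).
      return 0;
    }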