diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index e74154698b59a6de51aed929bf18b1f0f707043f..d19b163817e41e9b2fad42916d84778192edcb08 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -157,7 +157,6 @@ pass_library(layer_norm_fuse_pass inference) pass_library(add_support_int8_pass inference) pass_library(matmul_scale_fuse_pass inference) pass_library(gpu_cpu_map_matmul_to_mul_pass inference) -pass_library(mixed_precision_configure_pass inference) pass_library(dense_fc_to_sparse_pass inference) pass_library(dense_multihead_matmul_to_sparse_pass inference) pass_library(generate_pass DEPS pass_desc_proto) diff --git a/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc b/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc deleted file mode 100644 index 80f201d2d5afce4aa54450f6733c23558e83d3f1..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/ir/mixed_precision_configure_pass.h" - -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/op_version_registry.h" - -namespace paddle { -namespace framework { -namespace ir { - -void MixedPrecisionConfigurePass::InsertCastOps( - Graph* graph, const StringSet& blacklist) const { - VLOG(3) << "Insert the cast op before and after the kernel that does not " - "supports fp16 precision"; - - auto update_cast_desc = [&](framework::OpDesc& desc, - const std::string& x_name, - const std::string& out_name, const int in_dtype, - const int out_dtype) { - desc.SetType("cast"); - desc.SetInput("X", {x_name}); - desc.SetOutput("Out", {out_name}); - desc.SetAttr("in_dtype", in_dtype); - desc.SetAttr("out_dtype", out_dtype); - desc.SetAttr("use_mkldnn", false); - desc.SetAttr("with_quant_attr", false); - desc.Flush(); - }; - - auto cast_input = [&](Graph* graph, Node* op_node, - const StringSet& cast_list) { - auto inlinks = op_node->inputs; - for (auto* pre_node : inlinks) { - if (pre_node->IsVar()) { - const auto is_persistable = pre_node->Var()->Persistable(); - const auto is_float = - pre_node->Var()->GetDataType() == proto::VarType::FP16 || - pre_node->Var()->GetDataType() == proto::VarType::FP32 || - pre_node->Var()->GetDataType() == proto::VarType::FP64; - if (!is_persistable && is_float) { - int suffix = 0; - for (auto* pre_node_input : pre_node->inputs) { - if (!pre_node_input->IsOp()) continue; - const auto& type = pre_node_input->Op()->Type(); - if (!cast_list.count(type) && type != "cast") { - std::string old_name = pre_node->Name(); - std::string new_name = - old_name + "_cast.tmp_" + std::to_string(suffix); - suffix++; - - framework::OpDesc new_op_desc(op_node->Op()->Block()); - // 4 for fp16, 5 for fp32 - update_cast_desc(new_op_desc, old_name, new_name, 4, 5); - auto* new_op = graph->CreateOpNode(&new_op_desc); - - VarDesc out_var(new_name); - out_var.SetPersistable(false); - auto* node_var = graph->CreateVarNode(&out_var); - - op_node->Op()->RenameInput(old_name, new_name); - IR_NODE_LINK_TO(pre_node, new_op); - IR_NODE_LINK_TO(new_op, node_var); - IR_NODE_LINK_TO(node_var, op_node); - } - } - } - } - } - }; - - auto cast_output = [&](Graph* graph, Node* op_node, - const StringSet& cast_list) { - auto outlinks = op_node->outputs; - for (auto* next_node : outlinks) { - if (next_node->IsVar()) { - const auto is_persistable = next_node->Var()->Persistable(); - const auto is_float = - next_node->Var()->GetDataType() == proto::VarType::FP16 || - next_node->Var()->GetDataType() == proto::VarType::FP32 || - next_node->Var()->GetDataType() == proto::VarType::FP64; - if (!is_persistable && is_float) { - int suffix = 0; - for (auto* next_node_output : next_node->outputs) { - if (!next_node_output->IsOp()) continue; - - const auto& type = next_node_output->Op()->Type(); - if (!cast_list.count(type) && type != "cast") { - std::string old_name = next_node->Name(); - std::string new_name = - old_name + "_cast.tmp_" + std::to_string(suffix); - suffix++; - - framework::OpDesc new_op_desc(op_node->Op()->Block()); - // 4 for fp16, 5 for fp32 - update_cast_desc(new_op_desc, old_name, new_name, 5, 4); - auto* new_op = graph->CreateOpNode(&new_op_desc); - - VarDesc out_var(new_name); - out_var.SetPersistable(false); - auto* node_var = graph->CreateVarNode(&out_var); - - next_node_output->Op()->RenameInput(old_name, new_name); - IR_NODE_LINK_TO(next_node, new_op); - IR_NODE_LINK_TO(new_op, node_var); - IR_NODE_LINK_TO(node_var, next_node_output); - } - } - } - } - } - }; - - for (auto* op_node : - ir::TopologyVarientSort(*graph, static_cast(0))) { - if (!op_node->IsOp() || op_node->Op()->Type() == "feed" || - op_node->Op()->Type() == "fetch") - continue; - - const auto& type = op_node->Op()->Type(); - if (blacklist.count(type)) { - cast_input(graph, op_node, blacklist); - cast_output(graph, op_node, blacklist); - } - } -} - -void MixedPrecisionConfigurePass::ApplyImpl(Graph* graph) const { - const auto blacklist = - Get>("gpu_fp16_disabled_op_types"); - InsertCastOps(graph, blacklist); -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_PASS(mixed_precision_configure_pass, - paddle::framework::ir::MixedPrecisionConfigurePass); diff --git a/paddle/fluid/framework/ir/mixed_precision_configure_pass.h b/paddle/fluid/framework/ir/mixed_precision_configure_pass.h deleted file mode 100644 index fc5a612ecb833d2a5117a2dab58747d21226df8d..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ir/mixed_precision_configure_pass.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/fluid/framework/ir/fuse_pass_base.h" - -namespace paddle { -namespace framework { -namespace ir { - -using StringSet = std::unordered_set; - -class MixedPrecisionConfigurePass : public FusePassBase { - public: - MixedPrecisionConfigurePass() = default; - virtual ~MixedPrecisionConfigurePass() {} - - protected: - void ApplyImpl(Graph* graph) const override; - - private: - void InsertCastOps(Graph* graph, const StringSet& blacklist) const; -}; - -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 07b7b37485956a2ecfc150bdc00c56c346f7d96b..588a304108f78721d0f3a8d2f3ebcbf01b473618 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -80,7 +80,8 @@ struct Argument { public: \ type__& field__() { \ PADDLE_ENFORCE_EQ( \ - Has(#field__), true, \ + Has(#field__), \ + true, \ platform::errors::PreconditionNotMet("There is no such field")); \ return field__##_; \ } \ @@ -97,41 +98,45 @@ struct Argument { #define DECL_ARGUMENT_FIELD_VALID(field__) \ bool field__##_valid() { return Has(#field__); } -#define DECL_ARGUMENT_UNIQUE_FIELD(field__, Field, type__) \ - public: \ - type__& field__() { \ - PADDLE_ENFORCE_NOT_NULL(field__##_, platform::errors::PreconditionNotMet( \ - "filed should not be null.")); \ - PADDLE_ENFORCE_EQ( \ - Has(#field__), true, \ - platform::errors::PreconditionNotMet("There is no such field")); \ - return *static_cast(field__##_.get()); \ - } \ - void Set##Field(type__* x) { \ - field__##_ = \ - unique_ptr_t(x, [](void* x) { delete static_cast(x); }); \ - valid_fields_.insert(#field__); \ - } \ - void Set##Field##NotOwned(type__* x) { \ - valid_fields_.insert(#field__); \ - field__##_ = unique_ptr_t(x, [](void* x) {}); \ - } \ - DECL_ARGUMENT_FIELD_VALID(field__); \ - type__* field__##_ptr() { \ - PADDLE_ENFORCE_EQ( \ - Has(#field__), true, \ - platform::errors::PreconditionNotMet("There is no such field")); \ - return static_cast(field__##_.get()); \ - } \ - type__* Release##Field() { \ - PADDLE_ENFORCE_EQ( \ - Has(#field__), true, \ - platform::errors::PreconditionNotMet("There is no such field")); \ - valid_fields_.erase(#field__); \ - return static_cast(field__##_.release()); \ - } \ - \ - private: \ +#define DECL_ARGUMENT_UNIQUE_FIELD(field__, Field, type__) \ + public: \ + type__& field__() { \ + PADDLE_ENFORCE_NOT_NULL( \ + field__##_, \ + platform::errors::PreconditionNotMet("filed should not be null.")); \ + PADDLE_ENFORCE_EQ( \ + Has(#field__), \ + true, \ + platform::errors::PreconditionNotMet("There is no such field")); \ + return *static_cast(field__##_.get()); \ + } \ + void Set##Field(type__* x) { \ + field__##_ = \ + unique_ptr_t(x, [](void* x) { delete static_cast(x); }); \ + valid_fields_.insert(#field__); \ + } \ + void Set##Field##NotOwned(type__* x) { \ + valid_fields_.insert(#field__); \ + field__##_ = unique_ptr_t(x, [](void* x) {}); \ + } \ + DECL_ARGUMENT_FIELD_VALID(field__); \ + type__* field__##_ptr() { \ + PADDLE_ENFORCE_EQ( \ + Has(#field__), \ + true, \ + platform::errors::PreconditionNotMet("There is no such field")); \ + return static_cast(field__##_.get()); \ + } \ + type__* Release##Field() { \ + PADDLE_ENFORCE_EQ( \ + Has(#field__), \ + true, \ + platform::errors::PreconditionNotMet("There is no such field")); \ + valid_fields_.erase(#field__); \ + return static_cast(field__##_.release()); \ + } \ + \ + private: \ unique_ptr_t field__##_; DECL_ARGUMENT_FIELD(predictor_id, PredictorID, int); @@ -153,34 +158,40 @@ struct Argument { DECL_ARGUMENT_UNIQUE_FIELD(main_program, MainProgram, framework::ProgramDesc); // The ir passes to perform in analysis phase. - DECL_ARGUMENT_FIELD(ir_analysis_passes, IrAnalysisPasses, + DECL_ARGUMENT_FIELD(ir_analysis_passes, + IrAnalysisPasses, std::vector); - DECL_ARGUMENT_FIELD(analysis_passes, AnalysisPasses, + DECL_ARGUMENT_FIELD(analysis_passes, + AnalysisPasses, std::vector); // whether to mute all logs in inference. DECL_ARGUMENT_FIELD(disable_logs, DisableLogs, bool); // Pass a set of op types to enable its mkldnn kernel - DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes, + DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, + MKLDNNEnabledOpTypes, std::unordered_set); // The cache capacity of different input shapes for mkldnn. DECL_ARGUMENT_FIELD(mkldnn_cache_capacity, MkldnnCacheCapacity, int); #ifdef PADDLE_WITH_MKLDNN // A set of op types to enable their quantized kernels - DECL_ARGUMENT_FIELD(quantize_enabled_op_types, QuantizeEnabledOpTypes, + DECL_ARGUMENT_FIELD(quantize_enabled_op_types, + QuantizeEnabledOpTypes, std::unordered_set); // A set of op IDs to exclude from enabling their quantized kernels - DECL_ARGUMENT_FIELD(quantize_excluded_op_ids, QuantizeExcludedOpIds, + DECL_ARGUMENT_FIELD(quantize_excluded_op_ids, + QuantizeExcludedOpIds, std::unordered_set); // Scales for variables to be quantized DECL_ARGUMENT_FIELD(quant_var_scales, QuantVarScales, VarQuantScale); // A set of op types to enable their bfloat16 kernels - DECL_ARGUMENT_FIELD(bfloat16_enabled_op_types, Bfloat16EnabledOpTypes, + DECL_ARGUMENT_FIELD(bfloat16_enabled_op_types, + Bfloat16EnabledOpTypes, std::unordered_set); DECL_ARGUMENT_FIELD(use_mkldnn_int8, UseMkldnnInt8, bool); @@ -190,9 +201,6 @@ struct Argument { DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); DECL_ARGUMENT_FIELD(use_fc_padding, UseFcPadding, bool); DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); - DECL_ARGUMENT_FIELD(use_gpu_fp16, UseGPUFp16, bool); - DECL_ARGUMENT_FIELD(gpu_fp16_disabled_op_types, GpuFp16DisabledOpTypes, - std::unordered_set); // Usually use for trt dynamic shape. // TRT will select the best kernel according to opt shape @@ -209,25 +217,33 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int); DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); - DECL_ARGUMENT_FIELD(tensorrt_disabled_ops, TensorRtDisabledOPs, + DECL_ARGUMENT_FIELD(tensorrt_disabled_ops, + TensorRtDisabledOPs, std::vector); - DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode, + DECL_ARGUMENT_FIELD(tensorrt_precision_mode, + TensorRtPrecisionMode, AnalysisConfig::Precision); - DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine, + DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, + TensorRtUseStaticEngine, bool); DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool); DECL_ARGUMENT_FIELD(tensorrt_use_varseqlen, TensorRtUseOSS, bool); DECL_ARGUMENT_FIELD(tensorrt_with_interleaved, TensorRtWithInterleaved, bool); - DECL_ARGUMENT_FIELD(tensorrt_transformer_posid, TensorRtTransformerPosid, + DECL_ARGUMENT_FIELD(tensorrt_transformer_posid, + TensorRtTransformerPosid, std::string); - DECL_ARGUMENT_FIELD(tensorrt_transformer_maskid, TensorRtTransformerMaskid, + DECL_ARGUMENT_FIELD(tensorrt_transformer_maskid, + TensorRtTransformerMaskid, std::string); DECL_ARGUMENT_FIELD(tensorrt_shape_range_info_path, - TensorRtShapeRangeInfoPath, std::string); - DECL_ARGUMENT_FIELD(tensorrt_tuned_dynamic_shape, TensorRtTunedDynamicShape, + TensorRtShapeRangeInfoPath, + std::string); + DECL_ARGUMENT_FIELD(tensorrt_tuned_dynamic_shape, + TensorRtTunedDynamicShape, bool); DECL_ARGUMENT_FIELD(tensorrt_allow_build_at_runtime, - TensorRtAllowBuildAtRuntime, bool); + TensorRtAllowBuildAtRuntime, + bool); DECL_ARGUMENT_FIELD(tensorrt_use_inspector, TensorRtUseInspector, bool); DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool); @@ -235,10 +251,12 @@ struct Argument { DECL_ARGUMENT_FIELD(dlnne_max_batch_size, DlnneMaxBatchSize, int); DECL_ARGUMENT_FIELD(dlnne_workspace_size, DlnneWorkspaceSize, int); - DECL_ARGUMENT_FIELD(lite_passes_filter, LitePassesFilter, + DECL_ARGUMENT_FIELD(lite_passes_filter, + LitePassesFilter, std::vector); DECL_ARGUMENT_FIELD(lite_ops_filter, LiteOpsFilter, std::vector); - DECL_ARGUMENT_FIELD(lite_precision_mode, LitePrecisionMode, + DECL_ARGUMENT_FIELD(lite_precision_mode, + LitePrecisionMode, AnalysisConfig::Precision); DECL_ARGUMENT_FIELD(lite_zero_copy, LiteZeroCopy, bool); @@ -252,19 +270,26 @@ struct Argument { DECL_ARGUMENT_FIELD(xpu_device_id, XpuDeviceId, int); DECL_ARGUMENT_FIELD(use_nnadapter, UseNNAdapter, bool); - DECL_ARGUMENT_FIELD(nnadapter_model_cache_dir, NNAdapterModelCacheDir, + DECL_ARGUMENT_FIELD(nnadapter_model_cache_dir, + NNAdapterModelCacheDir, std::string); - DECL_ARGUMENT_FIELD(nnadapter_device_names, NNAdapterDeviceNames, + DECL_ARGUMENT_FIELD(nnadapter_device_names, + NNAdapterDeviceNames, std::vector); - DECL_ARGUMENT_FIELD(nnadapter_context_properties, NNAdapterContextProperties, + DECL_ARGUMENT_FIELD(nnadapter_context_properties, + NNAdapterContextProperties, std::string); DECL_ARGUMENT_FIELD(nnadapter_subgraph_partition_config_buffer, - NNAdapterSubgraphPartitionConfigBuffer, std::string); + NNAdapterSubgraphPartitionConfigBuffer, + std::string); DECL_ARGUMENT_FIELD(nnadapter_subgraph_partition_config_path, - NNAdapterSubgraphPartitionConfigPath, std::string); - DECL_ARGUMENT_FIELD(nnadapter_model_cache_token, NNAdapterModelCacheToken, + NNAdapterSubgraphPartitionConfigPath, + std::string); + DECL_ARGUMENT_FIELD(nnadapter_model_cache_token, + NNAdapterModelCacheToken, std::vector); - DECL_ARGUMENT_FIELD(nnadapter_model_cache_buffer, NNAdapterModelCacheBuffer, + DECL_ARGUMENT_FIELD(nnadapter_model_cache_buffer, + NNAdapterModelCacheBuffer, std::vector>); // Memory optimized related. @@ -275,13 +300,15 @@ struct Argument { DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int); // The program transformed by IR analysis phase. - DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram, + DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, + IrAnalyzedProgram, framework::proto::ProgramDesc); DECL_ARGUMENT_FIELD(fusion_statis, FusionStatis, fusion_statis_t); // Only used in paddle-lite subgraph. - DECL_ARGUMENT_FIELD(cpu_math_library_num_threads, CpuMathLibraryNumThreads, + DECL_ARGUMENT_FIELD(cpu_math_library_num_threads, + CpuMathLibraryNumThreads, int); // ipu related @@ -293,7 +320,8 @@ struct Argument { DECL_ARGUMENT_FIELD(ipu_enable_fp16, IpuEnableFp16, bool); DECL_ARGUMENT_FIELD(ipu_replica_num, IpuReplicaNum, int); DECL_ARGUMENT_FIELD(ipu_available_memory_proportion, - IpuAvailableMemoryProportion, float); + IpuAvailableMemoryProportion, + float); DECL_ARGUMENT_FIELD(ipu_enable_half_partial, IpuEnableHalfPartial, bool); // npu related @@ -306,7 +334,8 @@ struct Argument { #define ARGUMENT_CHECK_FIELD(argument__, fieldname__) \ PADDLE_ENFORCE_EQ( \ - argument__->Has(#fieldname__), true, \ + argument__->Has(#fieldname__), \ + true, \ platform::errors::PreconditionNotMet( \ "the argument field [%s] should be set", #fieldname__)); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 6c74d7b738cf678dcbf38cdcc3f8a734152a0c35..49878884ac6276034f1c77eb0c144ed1fc54710f 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -68,12 +68,15 @@ void IRPassManager::CreatePasses(Argument *argument, auto precision_mode = argument->tensorrt_precision_mode(); bool enable_int8 = precision_mode == AnalysisConfig::Precision::kInt8; pass->Set("enable_int8", new bool(enable_int8)); - pass->Set("max_input_shape", new std::map>( - argument->max_input_shape())); - pass->Set("min_input_shape", new std::map>( - argument->min_input_shape())); - pass->Set("optim_input_shape", new std::map>( - argument->optim_input_shape())); + pass->Set("max_input_shape", + new std::map>( + argument->max_input_shape())); + pass->Set("min_input_shape", + new std::map>( + argument->min_input_shape())); + pass->Set("optim_input_shape", + new std::map>( + argument->optim_input_shape())); // tuned trt dynamic_shape pass->Set("trt_tuned_dynamic_shape", new bool(argument->tensorrt_tuned_dynamic_shape())); @@ -143,14 +146,16 @@ void IRPassManager::CreatePasses(Argument *argument, bool int8_valid = !(model_from_memory && optim_cache_dir.empty() && enable_int8 && use_calib_mode); PADDLE_ENFORCE_EQ( - int8_valid, true, + int8_valid, + true, platform::errors::PreconditionNotMet( "When you are in TRT INT8 mode, and load model from " "memory, you should set optim_cache_dir using " "config.SetOptimCacheDir()")); if (model_from_memory && use_static_engine) { PADDLE_ENFORCE_EQ( - optim_cache_dir.empty(), false, + optim_cache_dir.empty(), + false, platform::errors::PreconditionNotMet( "When you are using Paddle-TRT, and using load model " "from memory, and also set the use_static to true. " @@ -161,7 +166,8 @@ void IRPassManager::CreatePasses(Argument *argument, if (!optim_cache_dir.empty()) { if (!PathExists(optim_cache_dir)) { PADDLE_ENFORCE_NE( - MKDIR(optim_cache_dir.c_str()), -1, + MKDIR(optim_cache_dir.c_str()), + -1, platform::errors::PreconditionNotMet( "Can not create optimize cache directory: %s, Make sure you " "have permission to write", @@ -187,8 +193,9 @@ void IRPassManager::CreatePasses(Argument *argument, new std::string(argument->tensorrt_shape_range_info_path())); pass->Set("trt_allow_build_at_runtime", new bool(argument->tensorrt_allow_build_at_runtime())); - pass->Set("trt_disabled_ops", new std::vector( - argument->tensorrt_disabled_ops())); + pass->Set( + "trt_disabled_ops", + new std::vector(argument->tensorrt_disabled_ops())); pass->Set("trt_use_dla", new bool(argument->tensorrt_use_dla())); pass->Set("trt_dla_core", new int(argument->tensorrt_dla_core())); // Setting the disable_trt_plugin_fp16 to true means that TRT plugin will @@ -200,10 +207,6 @@ void IRPassManager::CreatePasses(Argument *argument, new int(argument->dlnne_min_subgraph_size())); pass->Set("program", new framework::ProgramDesc *(&argument->main_program())); - } else if (pass_name == "mixed_precision_configure_pass") { - pass->Set("gpu_fp16_disabled_op_types", - new std::unordered_set( - argument->gpu_fp16_disabled_op_types())); } if (pass_name == "lite_subgraph_pass") { bool lite_enable_int8 = @@ -272,8 +275,9 @@ std::unique_ptr IRPassManager::Apply(std::unique_ptr graph) { if (passes_.empty()) { return graph; } - PADDLE_ENFORCE_NOT_NULL(graph.get(), platform::errors::PreconditionNotMet( - "Graph cannot be NULL.")); + PADDLE_ENFORCE_NOT_NULL( + graph.get(), + platform::errors::PreconditionNotMet("Graph cannot be NULL.")); // Apply all the passes for (const auto &pass : passes_) { if (pass->Type() != "graph_viz_pass" && !disable_logs_) { diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index a0c7a94cd1b30107e7556ff586d1e90b7b8774d4..a785aba4bb40bc6483f84b909ac947543e1a1fc2 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -15,7 +15,6 @@ #include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h" #include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/enforce.h" @@ -37,7 +36,8 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) { LOG(INFO) << "Sync params from CPU to NPU"; - PADDLE_ENFORCE_EQ(argument->npu_device_id_valid(), true, + PADDLE_ENFORCE_EQ(argument->npu_device_id_valid(), + true, platform::errors::PreconditionNotMet( "The npu_device_id field should be valid")); platform::Place place = platform::NPUPlace(argument->npu_device_id()); @@ -46,8 +46,9 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) { for (auto &var_name : all_vars) { auto *var = scope->FindLocalVar(var_name); - PADDLE_ENFORCE_NOT_NULL(var, platform::errors::PreconditionNotMet( - "The var should not be nullptr")); + PADDLE_ENFORCE_NOT_NULL( + var, + platform::errors::PreconditionNotMet("The var should not be nullptr")); if (var->IsType() || var->IsType()) { @@ -67,26 +68,6 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) { #else -void IrParamsSyncAmongDevicesPass::GetVarNameToOpTypeMap( - const framework::ir::Graph &graph, - std::unordered_map *var_name_op_type_map) { - std::vector node_list = - framework::ir::TopologyVarientSort( - graph, static_cast(0)); - for (auto *op_node : node_list) { - if (!op_node->IsOp() || op_node->Op()->Type() == "feed" || - op_node->Op()->Type() == "fetch") - continue; - - for (auto *pre_node : op_node->inputs) { - if (pre_node->IsVar() && pre_node->Var()->Persistable()) { - var_name_op_type_map->insert(std::pair( - pre_node->Var()->Name(), op_node->Op()->Type())); - } - } - } -} - void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { // The parameters are on the cpu, therefore, synchronization is not necessary. if (!argument->use_gpu()) return; @@ -100,7 +81,8 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { LOG(INFO) << "Sync params from CPU to GPU"; - PADDLE_ENFORCE_EQ(argument->gpu_device_id_valid(), true, + PADDLE_ENFORCE_EQ(argument->gpu_device_id_valid(), + true, platform::errors::PreconditionNotMet( "The gpu_device_id field should be valid")); platform::Place place = platform::CUDAPlace(argument->gpu_device_id()); @@ -124,54 +106,34 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { if (with_dynamic_shape) { reserve_cpu_weights = true; } - - bool mixed_precision_mode = - argument->Has("use_gpu_fp16") && argument->use_gpu_fp16(); - std::unordered_map var_name_op_type_map{}; - std::unordered_set blacklist{}; - if (mixed_precision_mode) { - GetVarNameToOpTypeMap(graph, &var_name_op_type_map); - blacklist = argument->gpu_fp16_disabled_op_types(); - } - for (auto &var_name : all_vars) { - if (std::count(repetitive_params.begin(), repetitive_params.end(), - var_name)) { + if (std::count( + repetitive_params.begin(), repetitive_params.end(), var_name)) { if (!reserve_cpu_weights) { scope->EraseVars({var_name}); } continue; } auto *var = scope->FindLocalVar(var_name); - PADDLE_ENFORCE_NOT_NULL(var, platform::errors::PreconditionNotMet( - "The var should not be nullptr")); + PADDLE_ENFORCE_NOT_NULL( + var, + platform::errors::PreconditionNotMet("The var should not be nullptr")); if (var->IsType() || var->IsType()) { auto *t = var->GetMutable(); - bool is_float = t->dtype() == paddle::experimental::DataType::FLOAT32 || - t->dtype() == paddle::experimental::DataType::FLOAT64; - if (mixed_precision_mode && - !blacklist.count(var_name_op_type_map[var_name]) && is_float) { - framework::Tensor half_tensor; - half_tensor.set_type(paddle::experimental::DataType::FLOAT16); - half_tensor.Resize(t->dims()); - auto *half_data = - half_tensor.mutable_data(platform::CPUPlace()); - for (int i = 0; i < t->numel(); i++) { - auto *data = t->mutable_data(platform::CPUPlace()); - half_data[i] = static_cast(data[i]); - } - t->clear(); - paddle::framework::TensorCopySync(half_tensor, place, t); - } else { - platform::CPUPlace cpu_place; - framework::LoDTensor temp_tensor; - temp_tensor.Resize(t->dims()); - paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor); - t->clear(); - paddle::framework::TensorCopySync(temp_tensor, place, t); - } + platform::CPUPlace cpu_place; + framework::LoDTensor temp_tensor; + temp_tensor.Resize(t->dims()); + temp_tensor.mutable_data(cpu_place); + + // Copy the parameter data to a tmp tensor. + paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor); + // Reallocation the space on GPU + t->clear(); + + // Copy parameter data to newly allocated GPU space. + paddle::framework::TensorCopySync(temp_tensor, place, t); } } } @@ -180,7 +142,8 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { PADDLE_ENFORCE_EQ( - argument->scope_valid(), true, + argument->scope_valid(), + true, platform::errors::PreconditionNotMet("The scope field should be valid")); #ifdef PADDLE_WITH_ASCEND_CL diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h index f8209f051d53444435ed8c65b400f08bf8627553..d5e98ec886e65f829a1496b1431f23aad6c4bc4c 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h @@ -38,12 +38,7 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass { #ifdef PADDLE_WITH_ASCEND_CL void CopyParamsToNpu(Argument *argument); #else - - void GetVarNameToOpTypeMap( - const framework::ir::Graph& graph, - std::unordered_map* var_name_op_type_map); - - void CopyParamsToGpu(Argument* argument); + void CopyParamsToGpu(Argument *argument); #endif }; diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 9fdc7a93cc27b69e73192c89fd1f43ceec50ec0d..75a5d9ee4f55b9971bdda7d5241a9824135dc1fa 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -84,7 +84,6 @@ void AnalysisConfig::SetModel(const std::string &prog_file_path, Update(); } - void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -101,16 +100,18 @@ void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, } void AnalysisConfig::SetExecStream(void *stream) { - PADDLE_ENFORCE_NOT_NULL(stream, platform::errors::InvalidArgument( - "`stream` should not be nullptr")); + PADDLE_ENFORCE_NOT_NULL( + stream, + platform::errors::InvalidArgument("`stream` should not be nullptr")); exec_stream_ = stream; use_external_stream_ = true; Update(); } void *AnalysisConfig::GetExecStream() const { - PADDLE_ENFORCE_NOT_NULL(exec_stream_, platform::errors::InvalidArgument( - "`stream` should not be nullptr")); + PADDLE_ENFORCE_NOT_NULL( + exec_stream_, + platform::errors::InvalidArgument("`stream` should not be nullptr")); return exec_stream_; } @@ -124,27 +125,16 @@ void AnalysisConfig::DisableGpu() { Update(); } -void AnalysisConfig::Exp_EnableUseGpuFp16( - std::unordered_set op_list) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - use_gpu_fp16_ = true; - gpu_fp16_disabled_op_types_.insert(op_list.begin(), op_list.end()); -#else - LOG(ERROR) << "Please compile with gpu to Exp_EnableUseGpuFp16()"; - use_gpu_fp16_ = false; -#endif - - Update(); -} - void AnalysisConfig::DisableFCPadding() { use_fc_padding_ = false; Update(); } -void AnalysisConfig::EnableXpu(int l3_workspace_size, bool locked, - bool autotune, const std::string &autotune_file, +void AnalysisConfig::EnableXpu(int l3_workspace_size, + bool locked, + bool autotune, + const std::string &autotune_file, const std::string &precision, bool adaptive_seqlen) { use_xpu_ = true; @@ -158,7 +148,8 @@ void AnalysisConfig::EnableXpu(int l3_workspace_size, bool locked, } void AnalysisConfig::SetXpuDeviceId(int device_id) { - PADDLE_ENFORCE_EQ(use_xpu_, true, + PADDLE_ENFORCE_EQ(use_xpu_, + true, platform::errors::PreconditionNotMet( "Should call EnableXpu before SetXpuDeviceId.")); xpu_device_id_ = device_id; @@ -190,7 +181,8 @@ void AnalysisConfig::EnableCustomDevice(const std::string &device_type, Update(); } -void AnalysisConfig::EnableIpu(int ipu_device_num, int ipu_micro_batch_size, +void AnalysisConfig::EnableIpu(int ipu_device_num, + int ipu_micro_batch_size, bool ipu_enable_pipelining, int ipu_batches_per_step) { enable_ir_optim_ = true; @@ -204,7 +196,8 @@ void AnalysisConfig::EnableIpu(int ipu_device_num, int ipu_micro_batch_size, Update(); } -void AnalysisConfig::SetIpuConfig(bool ipu_enable_fp16, int ipu_replica_num, +void AnalysisConfig::SetIpuConfig(bool ipu_enable_fp16, + int ipu_replica_num, float ipu_available_memory_proportion, bool ipu_enable_half_partial) { ipu_enable_fp16_ = ipu_enable_fp16; @@ -262,8 +255,6 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(use_cudnn_); CP_MEMBER(gpu_device_id_); CP_MEMBER(memory_pool_init_size_mb_); - CP_MEMBER(use_gpu_fp16_); - CP_MEMBER(gpu_fp16_disabled_op_types_); CP_MEMBER(enable_memory_optim_); // TensorRT related. @@ -366,7 +357,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(custom_device_id_); if (use_gpu_) { - PADDLE_ENFORCE_EQ(use_xpu_, false, + PADDLE_ENFORCE_EQ(use_xpu_, + false, platform::errors::InvalidArgument( "Only one choice can be made between CPU and XPU.")); pass_builder_.reset(new GpuPassStrategy( @@ -406,8 +398,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { std::sort(all_passes.begin(), all_passes.end()); std::sort(other_passes.begin(), other_passes.end()); std::vector deleted_passes; - std::set_difference(all_passes.begin(), all_passes.end(), - other_passes.begin(), other_passes.end(), + std::set_difference(all_passes.begin(), + all_passes.end(), + other_passes.begin(), + other_passes.end(), std::inserter(deleted_passes, deleted_passes.begin())); for (auto ps : deleted_passes) { pass_builder_->DeletePass(ps); @@ -516,8 +510,11 @@ MkldnnQuantizerConfig *AnalysisConfig::mkldnn_quantizer_config() const { } void AnalysisConfig::EnableTensorRtEngine( - int workspace_size, int max_batch_size, int min_subgraph_size, - AnalysisConfig::Precision precision_mode, bool use_static, + int workspace_size, + int max_batch_size, + int min_subgraph_size, + AnalysisConfig::Precision precision_mode, + bool use_static, bool use_calib_mode) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (!use_gpu()) { @@ -594,19 +591,22 @@ void AnalysisConfig::Update() { pass_builder_.reset(new IpuPassStrategy); } else if (use_xpu()) { PADDLE_ENFORCE_EQ( - use_gpu(), false, + use_gpu(), + false, platform::errors::InvalidArgument( "Only one choice can be made between CPU and XPU.")); pass_builder_.reset(new XpuPassStrategy); } else if (use_npu()) { PADDLE_ENFORCE_EQ( - use_gpu(), false, + use_gpu(), + false, platform::errors::InvalidArgument( "Only one choice can be made between GPU and NPU.")); pass_builder_.reset(new NpuPassStrategy); } else if (use_custom_device()) { PADDLE_ENFORCE_EQ( - use_gpu(), false, + use_gpu(), + false, platform::errors::InvalidArgument( "Only one choice can be made between GPU and CustomDevice.")); pass_builder_.reset(new CustomDevicePassStrategy); @@ -624,21 +624,24 @@ void AnalysisConfig::Update() { *static_cast(pass_builder_.get()))); } else if (use_xpu()) { PADDLE_ENFORCE_EQ( - use_gpu(), false, + use_gpu(), + false, platform::errors::InvalidArgument( "Only one choice can be made between CPU and XPU.")); pass_builder_.reset(new XpuPassStrategy( *static_cast(pass_builder_.get()))); } else if (use_npu()) { PADDLE_ENFORCE_EQ( - use_gpu(), false, + use_gpu(), + false, platform::errors::InvalidArgument( "Only one choice can be made between GPU and NPU.")); pass_builder_.reset(new NpuPassStrategy( *static_cast(pass_builder_.get()))); } else if (use_custom_device()) { PADDLE_ENFORCE_EQ( - use_gpu(), false, + use_gpu(), + false, platform::errors::InvalidArgument( "Only one choice can be made between GPU and CustomDevice.")); pass_builder_.reset(new CustomDevicePassStrategy( @@ -677,20 +680,6 @@ void AnalysisConfig::Update() { #endif } - if (use_gpu_fp16_) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (!enable_ir_optim_) { - LOG(ERROR) << "Exp_EnableUseGpuFp16() only works when IR optimization is " - "enabled."; - } else if (!use_gpu()) { - LOG(ERROR) - << "Exp_EnableUseGpuFp16() only works when use_gpu is enabled."; - } else { - pass_builder()->Exp_EnableUseGpuFp16(); - } -#endif - } - if (use_mkldnn_) { #ifdef PADDLE_WITH_MKLDNN if (!enable_ir_optim_) { @@ -749,7 +738,8 @@ void AnalysisConfig::Update() { #endif pass_builder()->ClearPasses(); for (const auto &pass : kLiteSubgraphPasses) { - if (std::find(lite_passes_filter_.begin(), lite_passes_filter_.end(), + if (std::find(lite_passes_filter_.begin(), + lite_passes_filter_.end(), pass) == lite_passes_filter_.end()) { pass_builder()->AppendPass(pass); } @@ -758,7 +748,8 @@ void AnalysisConfig::Update() { if (use_xpu_) { #if (defined LITE_SUBGRAPH_WITH_XPU) || (defined PADDLE_WITH_XPU) - PADDLE_ENFORCE_EQ(use_gpu_, false, + PADDLE_ENFORCE_EQ(use_gpu_, + false, platform::errors::Unavailable( "Currently, XPU and GPU cannot be enabled in the " "same analysis configuration.")); @@ -771,7 +762,8 @@ void AnalysisConfig::Update() { if (use_npu_) { #if defined(PADDLE_WITH_ASCEND_CL) || defined(LITE_SUBGRAPH_WITH_NPU) - PADDLE_ENFORCE_EQ(use_gpu_, false, + PADDLE_ENFORCE_EQ(use_gpu_, + false, platform::errors::Unavailable( "Currently, NPU and GPU cannot be enabled in the " "same analysis configuration.")); @@ -809,8 +801,6 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << use_gpu_; ss << use_external_stream_; ss << exec_stream_; - ss << use_gpu_fp16_; - for (auto &item : gpu_fp16_disabled_op_types_) ss << item; ss << use_fc_padding_; ss << gpu_device_id_; ss << xpu_device_id_; @@ -957,7 +947,8 @@ void AnalysisConfig::DisableGlogInfo() { } void AnalysisConfig::EnableLiteEngine( - AnalysisConfig::Precision precision_mode, bool zero_copy, + AnalysisConfig::Precision precision_mode, + bool zero_copy, const std::vector &passes_filter, const std::vector &ops_filter) { use_lite_ = true; @@ -1057,9 +1048,9 @@ std::string AnalysisConfig::Summary() { // dynamic_shape os.InsertRow({"tensorrt_enable_dynamic_shape", min_input_shape_.empty() ? "false" : "true"}); - os.InsertRow({"tensorrt_tuned_dynamic_shape", trt_tuned_dynamic_shape_ - ? shape_range_info_path_ - : "false"}); + os.InsertRow( + {"tensorrt_tuned_dynamic_shape", + trt_tuned_dynamic_shape_ ? shape_range_info_path_ : "false"}); os.InsertRow( {"tensorrt_use_varseqlen", trt_use_varseqlen_ ? "true" : "false"}); @@ -1123,10 +1114,12 @@ LiteNNAdapterConfig &LiteNNAdapterConfig::SetModelCacheDir( LiteNNAdapterConfig &LiteNNAdapterConfig::SetModelCacheBuffers( const std::string &model_cache_token, const std::vector &model_cache_buffer) { - PADDLE_ENFORCE_EQ(model_cache_token.empty(), false, + PADDLE_ENFORCE_EQ(model_cache_token.empty(), + false, platform::errors::InvalidArgument( "model_cache_token should not be empty.")); - PADDLE_ENFORCE_EQ(model_cache_buffer.empty(), false, + PADDLE_ENFORCE_EQ(model_cache_buffer.empty(), + false, platform::errors::InvalidArgument( "model_cache_buffer should not be empty.")); PADDLE_ENFORCE_EQ(nnadapter_model_cache_buffers.count(model_cache_token), @@ -1165,7 +1158,8 @@ void AnalysisConfig::CollectShapeRangeInfo( << "all intermediate tensors in the compute graph and calculate " "the min_shape, max_shape and opt_shape."; collect_shape_range_info_ = true; - PADDLE_ENFORCE_EQ(shape_range_info_path.empty(), false, + PADDLE_ENFORCE_EQ(shape_range_info_path.empty(), + false, platform::errors::InvalidArgument( "The shape_range_info_path should not be empty, please " "re-check the argument.")); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index c32edc3650a8df832cc48bbc27f39f0f7c30649d..7cf49e533f7c5e9cc8e2229c996929fbd643b9f2 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1048,11 +1048,6 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetDlnneMinSubgraphSize(config_.dlnne_min_subgraph_size_); } - if (config_.gpu_fp16_enabled()) { - argument_.SetUseGPUFp16(true); - argument_.SetGpuFp16DisabledOpTypes(config_.gpu_fp16_disabled_op_types_); - } - if (config_.lite_engine_enabled()) { argument_.SetCpuMathLibraryNumThreads( config_.cpu_math_library_num_threads()); diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 1e45c24534267144fd736ac7811f59044d669bf2..a7871737ad4b1f03a53b6307d59294f7364d8058 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -371,19 +371,6 @@ TEST(AnalysisPredictor, enable_onnxruntime) { ASSERT_TRUE(!config.use_onnxruntime()); } -TEST(AnalysisPredictor, exp_enable_use_gpu_fp16) { - AnalysisConfig config; - config.SwitchIrOptim(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - config.EnableUseGpu(100, 0); - config.Exp_EnableUseGpuFp16(); - ASSERT_TRUE(config.gpu_fp16_enabled()); -#else - config.DisableGpu(); -#endif - LOG(INFO) << config.Summary(); -} - } // namespace paddle namespace paddle_infer { @@ -443,19 +430,6 @@ TEST(Predictor, EnableONNXRuntime) { auto predictor = CreatePredictor(config); } -TEST(Predictor, Exp_EnableUseGpuFp16) { - Config config; - config.SetModel(FLAGS_dirname); - config.SwitchIrOptim(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - config.EnableUseGpu(100, 0); - config.Exp_EnableUseGpuFp16(); -#else - config.DisableGpu(); -#endif - auto predictor = CreatePredictor(config); -} - TEST(Tensor, CpuShareExternalData) { Config config; config.SetModel(FLAGS_dirname); @@ -476,8 +450,8 @@ TEST(Tensor, CpuShareExternalData) { auto out = predictor->GetOutputHandle("fc_1.tmp_2"); auto out_shape = out->shape(); std::vector out_data; - out_data.resize(std::accumulate(out_shape.begin(), out_shape.end(), 1, - std::multiplies())); + out_data.resize(std::accumulate( + out_shape.begin(), out_shape.end(), 1, std::multiplies())); out->ShareExternalData(out_data.data(), out_shape, PlaceType::kCPU); predictor->Run(); @@ -507,7 +481,9 @@ TEST(Tensor, GpuShareExternalData) { for (size_t i = 0; i < 4; ++i) { cudaMalloc(reinterpret_cast(&input_gpu[i]), 4 * sizeof(int64_t)); - cudaMemcpy(input_gpu[i], input_data[i].data(), 4 * sizeof(int64_t), + cudaMemcpy(input_gpu[i], + input_data[i].data(), + 4 * sizeof(int64_t), cudaMemcpyHostToDevice); } @@ -519,9 +495,10 @@ TEST(Tensor, GpuShareExternalData) { auto out = predictor->GetOutputHandle("fc_1.tmp_2"); auto out_shape = out->shape(); float* out_data = nullptr; - auto out_size = std::accumulate(out_shape.begin(), out_shape.end(), 1, - std::multiplies()) * - sizeof(float); + auto out_size = + std::accumulate( + out_shape.begin(), out_shape.end(), 1, std::multiplies()) * + sizeof(float); cudaMalloc(reinterpret_cast(out_data), out_size * sizeof(float)); out->ShareExternalData(out_data, out_shape, PlaceType::kGPU); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index fe82bbf29cbb2fa94c2702f0eeb33803fb3a729d..74a57cbc26040ccf2e1664e556efbb21bde2b534 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -253,19 +253,6 @@ struct PD_INFER_DECL AnalysisConfig { /// /// void DisableGpu(); - /// - /// \brief Enable GPU fp16 precision computation, in experimental state. - /// - /// \param op_list The operator type list. - /// - void Exp_EnableUseGpuFp16(std::unordered_set op_list = {}); - /// - /// \brief A boolean state telling whether the GPU fp16 precision is turned - /// on. - /// - /// \return bool Whether the GPU fp16 precision is turned on. - /// - bool gpu_fp16_enabled() const { return use_gpu_fp16_; } /// /// \brief Turn on XPU. @@ -287,8 +274,10 @@ struct PD_INFER_DECL AnalysisConfig { /// \param precision Calculation accuracy of multi_encoder /// \param adaptive_seqlen Is the input of multi_encoder variable length /// - void EnableXpu(int l3_workspace_size = 0xfffc00, bool locked = false, - bool autotune = true, const std::string& autotune_file = "", + void EnableXpu(int l3_workspace_size = 0xfffc00, + bool locked = false, + bool autotune = true, + const std::string& autotune_file = "", const std::string& precision = "int16", bool adaptive_seqlen = false); @@ -301,7 +290,8 @@ struct PD_INFER_DECL AnalysisConfig { /// \param ipu_enable_pipelining enable pipelining. /// \param ipu_batches_per_step the number of batches per run in pipelining. /// - void EnableIpu(int ipu_device_num = 1, int ipu_micro_batch_size = 1, + void EnableIpu(int ipu_device_num = 1, + int ipu_micro_batch_size = 1, bool ipu_enable_pipelining = false, int ipu_batches_per_step = 1); @@ -315,7 +305,8 @@ struct PD_INFER_DECL AnalysisConfig { /// \param ipu_enable_half_partial enable fp16 partial for matmul, only work /// with fp16. /// - void SetIpuConfig(bool ipu_enable_fp16 = false, int ipu_replica_num = 1, + void SetIpuConfig(bool ipu_enable_fp16 = false, + int ipu_replica_num = 1, float ipu_available_memory_proportion = 1.0, bool ipu_enable_half_partial = false); @@ -525,7 +516,8 @@ struct PD_INFER_DECL AnalysisConfig { /// /// void EnableTensorRtEngine(int workspace_size = 1 << 20, - int max_batch_size = 1, int min_subgraph_size = 3, + int max_batch_size = 1, + int min_subgraph_size = 3, Precision precision = Precision::kFloat32, bool use_static = false, bool use_calib_mode = true); @@ -821,8 +813,10 @@ struct PD_INFER_DECL AnalysisConfig { /// \param params_buffer The memory buffer of the combined parameters file. /// \param params_buffer_size The size of the combined parameters data. /// - void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size, - const char* params_buffer, size_t params_buffer_size); + void SetModelBuffer(const char* prog_buffer, + size_t prog_buffer_size, + const char* params_buffer, + size_t params_buffer_size); /// /// \brief A boolean state telling whether the model is set from the CPU /// memory. @@ -929,20 +923,6 @@ struct PD_INFER_DECL AnalysisConfig { int gpu_device_id_{0}; uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB. bool thread_local_stream_{false}; - bool use_gpu_fp16_{false}; - std::unordered_set gpu_fp16_disabled_op_types_{ - "conv2d_fusion", - "conv2d", - "roll", - "strided_slice", - "depthwise_conv2d", - "unfold", - "generate_proposals_v2", - "nearest_interp_v2", - "bilinear_interp_v2" - "yolo_box", - "multiclass_nms3", - "matrix_nms"}; bool use_cudnn_{false}; bool use_external_stream_{false}; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 5fdf5fb3139d6ce4b110f5536992bfa3ec7a6e88..954a6898781c18719890f8a253430d153b1baf69 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -194,40 +194,6 @@ void GpuPassStrategy::EnableCUDNN() { use_cudnn_ = true; } -void GpuPassStrategy::Exp_EnableUseGpuFp16() { - passes_.assign({ - "is_test_pass", // - "simplify_with_basic_ops_pass", // - "conv_bn_fuse_pass", // - "conv_eltwiseadd_bn_fuse_pass", // - "embedding_eltwise_layernorm_fuse_pass", // - "multihead_matmul_fuse_pass_v2", // - "gpu_cpu_squeeze2_matmul_fuse_pass", // - "gpu_cpu_reshape2_matmul_fuse_pass", // - "gpu_cpu_flatten2_matmul_fuse_pass", // - "gpu_cpu_map_matmul_v2_to_mul_pass", // - "gpu_cpu_map_matmul_v2_to_matmul_pass", // - "gpu_cpu_map_matmul_to_mul_pass", // - // "fc_fuse_pass", // - "fc_elementwise_layernorm_fuse_pass", // -#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be - // guaranteed at least v7 -// cudnn8.0 has memory leak problem in conv + eltwise + act, so we -// disable the pass. -#if !(CUDNN_VERSION >= 8000 && CUDNN_VERSION < 8100) - "conv_elementwise_add_act_fuse_pass", // - "conv_elementwise_add2_act_fuse_pass", // -#endif - "conv_elementwise_add_fuse_pass", // -#endif // - "transpose_flatten_concat_fuse_pass", // - "mixed_precision_configure_pass", // - "runtime_context_cache_pass" // - }); - - use_gpu_fp16_ = true; -} - void GpuPassStrategy::EnableMKLDNN() { LOG(ERROR) << "GPU not support MKLDNN yet"; } diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index f01799c646077862b723937272089d5f8af01ce8..2b6c189cffcf270edb7396900061233a3eff195c 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -109,8 +109,11 @@ class PD_INFER_DECL PaddlePassBuilder { protected: /// \cond Protected std::vector analysis_passes_{ - {"ir_graph_build_pass", "ir_graph_clean_pass", "ir_analysis_pass", - "ir_params_sync_among_devices_pass", "adjust_cudnn_workspace_size_pass", + {"ir_graph_build_pass", + "ir_graph_clean_pass", + "ir_analysis_pass", + "ir_params_sync_among_devices_pass", + "adjust_cudnn_workspace_size_pass", "inference_op_replace_pass"}}; std::vector passes_; /// \endcond @@ -129,9 +132,6 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { /// \brief Enable the use of cuDNN kernel. virtual void EnableCUDNN() {} - /// \brief Enable use gpu fp16 kernel. - virtual void Exp_EnableUseGpuFp16() {} - /// \brief Enable the use of MKLDNN. /// The MKLDNN control exists in both CPU and GPU mode, because there can /// still be some CPU kernels running in GPU mode. @@ -150,10 +150,6 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { /// \return A bool variable implying whether we are in gpu mode. bool use_gpu() const { return use_gpu_; } - /// \brief Check if we are using gpu fp16 kernel. - /// \return A bool variable implying whether we are in gpu fp16 mode. - bool use_gpu_fp16() const { return use_gpu_fp16_; } - /// \brief Check if we are using xpu. /// \return A bool variable implying whether we are in xpu mode. bool use_xpu() const { return use_xpu_; } @@ -180,7 +176,6 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { bool use_npu_{false}; bool use_ipu_{false}; bool use_mkldnn_{false}; - bool use_gpu_fp16_{false}; bool use_custom_device_{false}; /// \endcond }; @@ -248,9 +243,6 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy { /// \brief Enable the use of cuDNN kernel. void EnableCUDNN() override; - /// \brief Enable the use of gpu fp16 kernel. - void Exp_EnableUseGpuFp16() override; - /// \brief Not supported in GPU mode yet. void EnableMKLDNN() override; @@ -269,7 +261,6 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy { protected: /// \cond Protected bool use_cudnn_{false}; - bool use_gpu_fp16_{false}; /// \endcond }; diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index d6ffbf010016a997b258038f713c286cca4c340d..2461a9c9525bbe738c7e87452ceb6819752bc9b5 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -113,7 +113,8 @@ template PaddleBuf PaddleBufCreate( py::array_t data) { PaddleBuf buf(data.size() * sizeof(T)); - std::copy_n(static_cast(data.data()), data.size(), + std::copy_n(static_cast(data.data()), + data.size(), static_cast(buf.data())); return buf; } @@ -123,7 +124,8 @@ void PaddleBufReset( PaddleBuf &buf, // NOLINT py::array_t data) { // NOLINT buf.Resize(data.size() * sizeof(T)); - std::copy_n(static_cast(data.data()), data.size(), + std::copy_n(static_cast(data.data()), + data.size(), static_cast(buf.data())); } @@ -131,12 +133,14 @@ template PaddleTensor PaddleTensorCreate( py::array_t data, const std::string name = "", - const std::vector> &lod = {}, bool copy = true) { + const std::vector> &lod = {}, + bool copy = true) { PaddleTensor tensor; if (copy) { PaddleBuf buf(data.size() * sizeof(T)); - std::copy_n(static_cast(data.data()), data.size(), + std::copy_n(static_cast(data.data()), + data.size(), static_cast(buf.data())); tensor.data = std::move(buf); } else { @@ -235,11 +239,13 @@ void PaddleInferShareExternalData(paddle_infer::Tensor &tensor, // NOLINT } if (input_tensor.dtype() == phi::DataType::FLOAT32) { tensor.ShareExternalData( - static_cast(input_tensor.data()), shape, + static_cast(input_tensor.data()), + shape, ToPaddleInferPlace(input_tensor.place().GetType())); } else if (input_tensor.dtype() == phi::DataType::FLOAT16) { tensor.ShareExternalData( - static_cast(input_tensor.data()), shape, + static_cast(input_tensor.data()), + shape, ToPaddleInferPlace(input_tensor.place().GetType())); } } @@ -379,9 +385,11 @@ void BindInferenceApi(py::module *m) { BindMkldnnQuantizerConfig(m); #endif m->def("create_paddle_predictor", - &paddle::CreatePaddlePredictor, py::arg("config")); + &paddle::CreatePaddlePredictor, + py::arg("config")); m->def("create_paddle_predictor", - &paddle::CreatePaddlePredictor, py::arg("config")); + &paddle::CreatePaddlePredictor, + py::arg("config")); m->def("create_predictor", [](const paddle_infer::Config &config) -> std::unique_ptr { @@ -478,15 +486,18 @@ void BindPaddleBuf(py::module *m) { void BindPaddleTensor(py::module *m) { py::class_(*m, "PaddleTensor") .def(py::init<>()) - .def(py::init(&PaddleTensorCreate), py::arg("data"), + .def(py::init(&PaddleTensorCreate), + py::arg("data"), py::arg("name") = "", py::arg("lod") = std::vector>(), py::arg("copy") = true) - .def(py::init(&PaddleTensorCreate), py::arg("data"), + .def(py::init(&PaddleTensorCreate), + py::arg("data"), py::arg("name") = "", py::arg("lod") = std::vector>(), py::arg("copy") = true) - .def(py::init(&PaddleTensorCreate), py::arg("data"), + .def(py::init(&PaddleTensorCreate), + py::arg("data"), py::arg("name") = "", py::arg("lod") = std::vector>(), py::arg("copy") = true) @@ -563,7 +574,8 @@ void BindNativePredictor(py::module *m) { .def("get_output_tensor", &NativePaddlePredictor::GetOutputTensor) .def("zero_copy_run", &NativePaddlePredictor::ZeroCopyRun) .def("clone", &NativePaddlePredictor::Clone) - .def("scope", &NativePaddlePredictor::scope, + .def("scope", + &NativePaddlePredictor::scope, py::return_value_policy::reference); } @@ -581,8 +593,9 @@ void BindAnalysisConfig(py::module *m) { .def(py::init()) .def(py::init()) .def("summary", &AnalysisConfig::Summary) - .def("set_model", (void(AnalysisConfig::*)(const std::string &)) & - AnalysisConfig::SetModel) + .def("set_model", + (void(AnalysisConfig::*)(const std::string &)) & + AnalysisConfig::SetModel) .def("set_model", (void(AnalysisConfig::*)(const std::string &, const std::string &)) & AnalysisConfig::SetModel) @@ -591,25 +604,32 @@ void BindAnalysisConfig(py::module *m) { .def("model_dir", &AnalysisConfig::model_dir) .def("prog_file", &AnalysisConfig::prog_file) .def("params_file", &AnalysisConfig::params_file) - .def("enable_use_gpu", &AnalysisConfig::EnableUseGpu, - py::arg("memory_pool_init_size_mb"), py::arg("device_id") = 0) - .def("exp_enable_use_gpu_fp16", &AnalysisConfig::Exp_EnableUseGpuFp16, - py::arg("gpu_fp16_disabled_op_types") = - std::unordered_set({})) - .def("enable_xpu", &AnalysisConfig::EnableXpu, + .def("enable_use_gpu", + &AnalysisConfig::EnableUseGpu, + py::arg("memory_pool_init_size_mb"), + py::arg("device_id") = 0) + .def("enable_xpu", + &AnalysisConfig::EnableXpu, py::arg("l3_workspace_size") = 16 * 1024 * 1024, - py::arg("locked") = false, py::arg("autotune") = true, - py::arg("autotune_file") = "", py::arg("precision") = "int16", + py::arg("locked") = false, + py::arg("autotune") = true, + py::arg("autotune_file") = "", + py::arg("precision") = "int16", py::arg("adaptive_seqlen") = false) - .def("set_xpu_device_id", &AnalysisConfig::SetXpuDeviceId, + .def("set_xpu_device_id", + &AnalysisConfig::SetXpuDeviceId, py::arg("device_id") = 0) .def("enable_npu", &AnalysisConfig::EnableNpu, py::arg("device_id") = 0) - .def("enable_ipu", &AnalysisConfig::EnableIpu, - py::arg("ipu_device_num") = 1, py::arg("ipu_micro_batch_size") = 1, + .def("enable_ipu", + &AnalysisConfig::EnableIpu, + py::arg("ipu_device_num") = 1, + py::arg("ipu_micro_batch_size") = 1, py::arg("ipu_enable_pipelining") = false, py::arg("ipu_batches_per_step") = 1) - .def("set_ipu_config", &AnalysisConfig::SetIpuConfig, - py::arg("ipu_enable_fp16") = false, py::arg("ipu_replica_num") = 1, + .def("set_ipu_config", + &AnalysisConfig::SetIpuConfig, + py::arg("ipu_enable_fp16") = false, + py::arg("ipu_replica_num") = 1, py::arg("ipu_available_memory_proportion") = 1.0, py::arg("ipu_enable_half_partial") = false) .def("disable_gpu", &AnalysisConfig::DisableGpu) @@ -627,27 +647,34 @@ void BindAnalysisConfig(py::module *m) { &AnalysisConfig::memory_pool_init_size_mb) .def("fraction_of_gpu_memory_for_pool", &AnalysisConfig::fraction_of_gpu_memory_for_pool) - .def("switch_ir_optim", &AnalysisConfig::SwitchIrOptim, + .def("switch_ir_optim", + &AnalysisConfig::SwitchIrOptim, py::arg("x") = true) .def("ir_optim", &AnalysisConfig::ir_optim) - .def("enable_memory_optim", &AnalysisConfig::EnableMemoryOptim, + .def("enable_memory_optim", + &AnalysisConfig::EnableMemoryOptim, py::arg("x") = true) .def("enable_profile", &AnalysisConfig::EnableProfile) .def("disable_glog_info", &AnalysisConfig::DisableGlogInfo) .def("glog_info_disabled", &AnalysisConfig::glog_info_disabled) .def("set_optim_cache_dir", &AnalysisConfig::SetOptimCacheDir) - .def("switch_use_feed_fetch_ops", &AnalysisConfig::SwitchUseFeedFetchOps, + .def("switch_use_feed_fetch_ops", + &AnalysisConfig::SwitchUseFeedFetchOps, py::arg("x") = true) .def("use_feed_fetch_ops_enabled", &AnalysisConfig::use_feed_fetch_ops_enabled) .def("switch_specify_input_names", - &AnalysisConfig::SwitchSpecifyInputNames, py::arg("x") = true) + &AnalysisConfig::SwitchSpecifyInputNames, + py::arg("x") = true) .def("specify_input_name", &AnalysisConfig::specify_input_name) - .def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine, - py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1, + .def("enable_tensorrt_engine", + &AnalysisConfig::EnableTensorRtEngine, + py::arg("workspace_size") = 1 << 20, + py::arg("max_batch_size") = 1, py::arg("min_subgraph_size") = 3, py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32, - py::arg("use_static") = false, py::arg("use_calib_mode") = true) + py::arg("use_static") = false, + py::arg("use_calib_mode") = true) .def("tensorrt_precision_mode", &AnalysisConfig::tensorrt_precision_mode) .def("set_trt_dynamic_shape_info", &AnalysisConfig::SetTRTDynamicShapeInfo, @@ -674,7 +701,8 @@ void BindAnalysisConfig(py::module *m) { .def("trt_allow_build_at_runtime", &AnalysisConfig::trt_allow_build_at_runtime) .def("exp_disable_tensorrt_ops", &AnalysisConfig::Exp_DisableTensorRtOPs) - .def("enable_tensorrt_dla", &AnalysisConfig::EnableTensorRtDLA, + .def("enable_tensorrt_dla", + &AnalysisConfig::EnableTensorRtDLA, py::arg("dla_core") = 0) .def("tensorrt_dla_enabled", &AnalysisConfig::tensorrt_dla_enabled) .def("enable_tensorrt_inspector", @@ -682,15 +710,18 @@ void BindAnalysisConfig(py::module *m) { .def("tensorrt_inspector_enabled", &AnalysisConfig::tensorrt_inspector_enabled) .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled) - .def("enable_dlnne", &AnalysisConfig::EnableDlnne, + .def("enable_dlnne", + &AnalysisConfig::EnableDlnne, py::arg("min_subgraph_size") = 3) - .def("enable_lite_engine", &AnalysisConfig::EnableLiteEngine, + .def("enable_lite_engine", + &AnalysisConfig::EnableLiteEngine, py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32, py::arg("zero_copy") = false, py::arg("passes_filter") = std::vector(), py::arg("ops_filter") = std::vector()) .def("lite_engine_enabled", &AnalysisConfig::lite_engine_enabled) - .def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug, + .def("switch_ir_debug", + &AnalysisConfig::SwitchIrDebug, py::arg("x") = true) .def("enable_mkldnn", &AnalysisConfig::EnableMKLDNN) .def("mkldnn_enabled", &AnalysisConfig::mkldnn_enabled) @@ -702,12 +733,15 @@ void BindAnalysisConfig(py::module *m) { .def("enable_quantizer", &AnalysisConfig::EnableMkldnnQuantizer) .def("enable_mkldnn_bfloat16", &AnalysisConfig::EnableMkldnnBfloat16) #ifdef PADDLE_WITH_MKLDNN - .def("quantizer_config", &AnalysisConfig::mkldnn_quantizer_config, + .def("quantizer_config", + &AnalysisConfig::mkldnn_quantizer_config, py::return_value_policy::reference) - .def("set_mkldnn_cache_capacity", &AnalysisConfig::SetMkldnnCacheCapacity, + .def("set_mkldnn_cache_capacity", + &AnalysisConfig::SetMkldnnCacheCapacity, py::arg("capacity") = 0) .def("set_bfloat16_op", &AnalysisConfig::SetBfloat16Op) - .def("enable_mkldnn_int8", &AnalysisConfig::EnableMkldnnInt8, + .def("enable_mkldnn_int8", + &AnalysisConfig::EnableMkldnnInt8, py::arg("mkldnn_int8_enabled_op_types") = std::unordered_set({})) .def("mkldnn_int8_enabled", &AnalysisConfig::mkldnn_int8_enabled) @@ -807,17 +841,20 @@ void BindAnalysisPredictor(py::module *m) { .def("prepare_argument", &AnalysisPredictor::PrepareArgument) .def("optimize_inference_program", &AnalysisPredictor::OptimizeInferenceProgram) - .def("analysis_argument", &AnalysisPredictor::analysis_argument, + .def("analysis_argument", + &AnalysisPredictor::analysis_argument, py::return_value_policy::reference) .def("clone", &AnalysisPredictor::Clone) - .def("scope", &AnalysisPredictor::scope, + .def("scope", + &AnalysisPredictor::scope, py::return_value_policy::reference) - .def("program", &AnalysisPredictor::program, + .def("program", + &AnalysisPredictor::program, py::return_value_policy::reference) .def("get_serialized_program", &AnalysisPredictor::GetSerializedProgram) .def("mkldnn_quantize", &AnalysisPredictor::MkldnnQuantize) - .def("SaveOptimModel", &AnalysisPredictor::SaveOptimModel, - py::arg("dir")); + .def( + "SaveOptimModel", &AnalysisPredictor::SaveOptimModel, py::arg("dir")); } void BindPaddleInferPredictor(py::module *m) { @@ -842,10 +879,12 @@ void BindPaddleInferPredictor(py::module *m) { void BindZeroCopyTensor(py::module *m) { py::class_(*m, "ZeroCopyTensor") - .def("reshape", py::overload_cast &>( - &ZeroCopyTensor::Reshape)) - .def("reshape", py::overload_cast( - &paddle_infer::Tensor::ReshapeStrings)) + .def( + "reshape", + py::overload_cast &>(&ZeroCopyTensor::Reshape)) + .def("reshape", + py::overload_cast( + &paddle_infer::Tensor::ReshapeStrings)) .def("copy_from_cpu", &ZeroCopyTensorCreate) .def("copy_from_cpu", &ZeroCopyTensorCreate) .def("copy_from_cpu", &ZeroCopyTensorCreate) @@ -860,10 +899,12 @@ void BindZeroCopyTensor(py::module *m) { void BindPaddleInferTensor(py::module *m) { py::class_(*m, "PaddleInferTensor") - .def("reshape", py::overload_cast &>( - &paddle_infer::Tensor::Reshape)) - .def("reshape", py::overload_cast( - &paddle_infer::Tensor::ReshapeStrings)) + .def("reshape", + py::overload_cast &>( + &paddle_infer::Tensor::Reshape)) + .def("reshape", + py::overload_cast( + &paddle_infer::Tensor::ReshapeStrings)) .def("copy_from_cpu_bind", &PaddleInferTensorCreate) .def("copy_from_cpu_bind", &PaddleInferTensorCreate) .def("copy_from_cpu_bind", &PaddleInferTensorCreate) @@ -881,7 +922,8 @@ void BindPaddleInferTensor(py::module *m) { void BindPredictorPool(py::module *m) { py::class_(*m, "PredictorPool") .def(py::init()) - .def("retrive", &paddle_infer::services::PredictorPool::Retrive, + .def("retrive", + &paddle_infer::services::PredictorPool::Retrive, py::return_value_policy::reference); } @@ -904,7 +946,8 @@ void BindPaddlePassBuilder(py::module *m) { .def("append_analysis_pass", &PaddlePassBuilder::AppendAnalysisPass) .def("turn_on_debug", &PaddlePassBuilder::TurnOnDebug) .def("debug_string", &PaddlePassBuilder::DebugString) - .def("all_passes", &PaddlePassBuilder::AllPasses, + .def("all_passes", + &PaddlePassBuilder::AllPasses, py::return_value_policy::reference) .def("analysis_passes", &PaddlePassBuilder::AnalysisPasses);