未验证 提交 7985407b 编写于 作者: W Wilber 提交者: GitHub

revert 40531 (#43807)

* revert 40531

* update
上级 69717717
...@@ -157,7 +157,6 @@ pass_library(layer_norm_fuse_pass inference) ...@@ -157,7 +157,6 @@ pass_library(layer_norm_fuse_pass inference)
pass_library(add_support_int8_pass inference) pass_library(add_support_int8_pass inference)
pass_library(matmul_scale_fuse_pass inference) pass_library(matmul_scale_fuse_pass inference)
pass_library(gpu_cpu_map_matmul_to_mul_pass inference) pass_library(gpu_cpu_map_matmul_to_mul_pass inference)
pass_library(mixed_precision_configure_pass inference)
pass_library(dense_fc_to_sparse_pass inference) pass_library(dense_fc_to_sparse_pass inference)
pass_library(dense_multihead_matmul_to_sparse_pass inference) pass_library(dense_multihead_matmul_to_sparse_pass inference)
pass_library(generate_pass DEPS pass_desc_proto) pass_library(generate_pass DEPS pass_desc_proto)
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/mixed_precision_configure_pass.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace framework {
namespace ir {
// Inserts "cast" ops around every op whose type is listed in `blacklist`.
//
// For each such op (feed/fetch ops are never considered):
//   * every non-persistable floating-point input whose producer op is neither
//     in `blacklist` nor a "cast" gets an fp16 -> fp32 cast in front of it;
//   * every non-persistable floating-point output gets an fp32 -> fp16 cast
//     in front of each consumer op that is neither in `blacklist` nor a
//     "cast".
//
// graph:     graph to modify in place.
// blacklist: op types around which casts are inserted.
void MixedPrecisionConfigurePass::InsertCastOps(
    Graph* graph, const StringSet& blacklist) const {
  VLOG(3) << "Insert the cast op before and after the kernel that does not "
             "supports fp16 precision";

  // Integer dtype codes written to the cast op's in_dtype/out_dtype attributes
  // (the original code hard-coded them as 4 for fp16 and 5 for fp32).
  const int fp16_dtype = static_cast<int>(proto::VarType::FP16);
  const int fp32_dtype = static_cast<int>(proto::VarType::FP32);

  // Fills `desc` so that it describes a cast op reading `x_name` and writing
  // `out_name`.
  auto update_cast_desc = [&](framework::OpDesc& desc,
                              const std::string& x_name,
                              const std::string& out_name, const int in_dtype,
                              const int out_dtype) {
    desc.SetType("cast");
    desc.SetInput("X", {x_name});
    desc.SetOutput("Out", {out_name});
    desc.SetAttr("in_dtype", in_dtype);
    desc.SetAttr("out_dtype", out_dtype);
    desc.SetAttr("use_mkldnn", false);
    desc.SetAttr("with_quant_attr", false);
    desc.Flush();
  };

  // True for variable nodes that are non-persistable and hold fp16/fp32/fp64
  // data. The persistable flag and dtype are both read unconditionally, as in
  // the original code.
  auto needs_cast = [](Node* node) {
    if (!node->IsVar()) return false;
    const auto is_persistable = node->Var()->Persistable();
    const auto dtype = node->Var()->GetDataType();
    const auto is_float = dtype == proto::VarType::FP16 ||
                          dtype == proto::VarType::FP32 ||
                          dtype == proto::VarType::FP64;
    return !is_persistable && is_float;
  };

  // Creates `var_node -> cast -> <new var> -> consumer` and renames the
  // consumer's input from the old variable name to the new one. The cast op
  // is created in the block of `block_owner`.
  //
  // NOTE(review): the pre-existing `var_node -> consumer` edge is left in
  // place, and the new VarDesc gets no dtype/shape - both inherited from the
  // original implementation; confirm whether later passes rely on either.
  auto insert_cast = [&](Node* block_owner, Node* var_node, Node* consumer,
                         const int in_dtype, const int out_dtype,
                         const int suffix) {
    const std::string old_name = var_node->Name();
    const std::string new_name =
        old_name + "_cast.tmp_" + std::to_string(suffix);
    framework::OpDesc new_op_desc(block_owner->Op()->Block());
    update_cast_desc(new_op_desc, old_name, new_name, in_dtype, out_dtype);
    auto* new_op = graph->CreateOpNode(&new_op_desc);
    VarDesc out_var(new_name);
    out_var.SetPersistable(false);
    auto* node_var = graph->CreateVarNode(&out_var);
    consumer->Op()->RenameInput(old_name, new_name);
    IR_NODE_LINK_TO(var_node, new_op);
    IR_NODE_LINK_TO(new_op, node_var);
    IR_NODE_LINK_TO(node_var, consumer);
  };

  // fp16 -> fp32 casts in front of `op_node`.
  auto cast_input = [&](Node* op_node, const StringSet& cast_list) {
    // Snapshot: insert_cast appends to op_node->inputs.
    const auto inlinks = op_node->inputs;
    for (auto* pre_node : inlinks) {
      if (!needs_cast(pre_node)) continue;
      int suffix = 0;
      const auto producers = pre_node->inputs;
      for (auto* producer : producers) {
        if (!producer->IsOp()) continue;
        const auto& type = producer->Op()->Type();
        if (cast_list.count(type) || type == "cast") continue;
        insert_cast(op_node, pre_node, op_node, fp16_dtype, fp32_dtype,
                    suffix);
        suffix++;
      }
    }
  };

  // fp32 -> fp16 casts between `op_node`'s outputs and their consumers.
  auto cast_output = [&](Node* op_node, const StringSet& cast_list) {
    const auto outlinks = op_node->outputs;
    for (auto* next_node : outlinks) {
      if (!needs_cast(next_node)) continue;
      int suffix = 0;
      // Iterate over a snapshot: insert_cast links the new cast op as an
      // output of next_node, i.e. it appends to next_node->outputs. Looping
      // over the live vector (as the original did) invalidates the loop
      // iterators whenever that append reallocates.
      const auto consumers = next_node->outputs;
      for (auto* consumer : consumers) {
        if (!consumer->IsOp()) continue;
        const auto& type = consumer->Op()->Type();
        if (cast_list.count(type) || type == "cast") continue;
        insert_cast(op_node, next_node, consumer, fp32_dtype, fp16_dtype,
                    suffix);
        suffix++;
      }
    }
  };

  for (auto* op_node :
       ir::TopologyVarientSort(*graph, static_cast<ir::SortKind>(0))) {
    if (!op_node->IsOp() || op_node->Op()->Type() == "feed" ||
        op_node->Op()->Type() == "fetch")
      continue;
    const auto& type = op_node->Op()->Type();
    if (blacklist.count(type)) {
      cast_input(op_node, blacklist);
      cast_output(op_node, blacklist);
    }
  }
}
// Pass entry point: reads the "gpu_fp16_disabled_op_types" attribute and
// inserts cast ops around every op of a listed type.
void MixedPrecisionConfigurePass::ApplyImpl(Graph* graph) const {
  const std::unordered_set<std::string> disabled_op_types =
      Get<std::unordered_set<std::string>>("gpu_fp16_disabled_op_types");
  InsertCastOps(graph, disabled_op_types);
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(mixed_precision_configure_pass,
paddle::framework::ir::MixedPrecisionConfigurePass);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
using StringSet = std::unordered_set<std::string>;
// Graph pass that inserts "cast" ops around a configurable set of op types.
class MixedPrecisionConfigurePass : public FusePassBase {
 public:
  MixedPrecisionConfigurePass() = default;
  virtual ~MixedPrecisionConfigurePass() = default;

 protected:
  // Runs the pass on `graph`, modifying it in place.
  void ApplyImpl(Graph* graph) const override;

 private:
  // Inserts cast ops before and after every op in `graph` whose type is
  // contained in `blacklist`.
  void InsertCastOps(Graph* graph, const StringSet& blacklist) const;
};
} // namespace ir
} // namespace framework
} // namespace paddle
...@@ -80,7 +80,8 @@ struct Argument { ...@@ -80,7 +80,8 @@ struct Argument {
public: \ public: \
type__& field__() { \ type__& field__() { \
PADDLE_ENFORCE_EQ( \ PADDLE_ENFORCE_EQ( \
Has(#field__), true, \ Has(#field__), \
true, \
platform::errors::PreconditionNotMet("There is no such field")); \ platform::errors::PreconditionNotMet("There is no such field")); \
return field__##_; \ return field__##_; \
} \ } \
...@@ -97,41 +98,45 @@ struct Argument { ...@@ -97,41 +98,45 @@ struct Argument {
#define DECL_ARGUMENT_FIELD_VALID(field__) \ #define DECL_ARGUMENT_FIELD_VALID(field__) \
bool field__##_valid() { return Has(#field__); } bool field__##_valid() { return Has(#field__); }
#define DECL_ARGUMENT_UNIQUE_FIELD(field__, Field, type__) \ #define DECL_ARGUMENT_UNIQUE_FIELD(field__, Field, type__) \
public: \ public: \
type__& field__() { \ type__& field__() { \
PADDLE_ENFORCE_NOT_NULL(field__##_, platform::errors::PreconditionNotMet( \ PADDLE_ENFORCE_NOT_NULL( \
"filed should not be null.")); \ field__##_, \
PADDLE_ENFORCE_EQ( \ platform::errors::PreconditionNotMet("filed should not be null.")); \
Has(#field__), true, \ PADDLE_ENFORCE_EQ( \
platform::errors::PreconditionNotMet("There is no such field")); \ Has(#field__), \
return *static_cast<type__*>(field__##_.get()); \ true, \
} \ platform::errors::PreconditionNotMet("There is no such field")); \
void Set##Field(type__* x) { \ return *static_cast<type__*>(field__##_.get()); \
field__##_ = \ } \
unique_ptr_t(x, [](void* x) { delete static_cast<type__*>(x); }); \ void Set##Field(type__* x) { \
valid_fields_.insert(#field__); \ field__##_ = \
} \ unique_ptr_t(x, [](void* x) { delete static_cast<type__*>(x); }); \
void Set##Field##NotOwned(type__* x) { \ valid_fields_.insert(#field__); \
valid_fields_.insert(#field__); \ } \
field__##_ = unique_ptr_t(x, [](void* x) {}); \ void Set##Field##NotOwned(type__* x) { \
} \ valid_fields_.insert(#field__); \
DECL_ARGUMENT_FIELD_VALID(field__); \ field__##_ = unique_ptr_t(x, [](void* x) {}); \
type__* field__##_ptr() { \ } \
PADDLE_ENFORCE_EQ( \ DECL_ARGUMENT_FIELD_VALID(field__); \
Has(#field__), true, \ type__* field__##_ptr() { \
platform::errors::PreconditionNotMet("There is no such field")); \ PADDLE_ENFORCE_EQ( \
return static_cast<type__*>(field__##_.get()); \ Has(#field__), \
} \ true, \
type__* Release##Field() { \ platform::errors::PreconditionNotMet("There is no such field")); \
PADDLE_ENFORCE_EQ( \ return static_cast<type__*>(field__##_.get()); \
Has(#field__), true, \ } \
platform::errors::PreconditionNotMet("There is no such field")); \ type__* Release##Field() { \
valid_fields_.erase(#field__); \ PADDLE_ENFORCE_EQ( \
return static_cast<type__*>(field__##_.release()); \ Has(#field__), \
} \ true, \
\ platform::errors::PreconditionNotMet("There is no such field")); \
private: \ valid_fields_.erase(#field__); \
return static_cast<type__*>(field__##_.release()); \
} \
\
private: \
unique_ptr_t field__##_; unique_ptr_t field__##_;
DECL_ARGUMENT_FIELD(predictor_id, PredictorID, int); DECL_ARGUMENT_FIELD(predictor_id, PredictorID, int);
...@@ -153,34 +158,40 @@ struct Argument { ...@@ -153,34 +158,40 @@ struct Argument {
DECL_ARGUMENT_UNIQUE_FIELD(main_program, MainProgram, framework::ProgramDesc); DECL_ARGUMENT_UNIQUE_FIELD(main_program, MainProgram, framework::ProgramDesc);
// The ir passes to perform in analysis phase. // The ir passes to perform in analysis phase.
DECL_ARGUMENT_FIELD(ir_analysis_passes, IrAnalysisPasses, DECL_ARGUMENT_FIELD(ir_analysis_passes,
IrAnalysisPasses,
std::vector<std::string>); std::vector<std::string>);
DECL_ARGUMENT_FIELD(analysis_passes, AnalysisPasses, DECL_ARGUMENT_FIELD(analysis_passes,
AnalysisPasses,
std::vector<std::string>); std::vector<std::string>);
// whether to mute all logs in inference. // whether to mute all logs in inference.
DECL_ARGUMENT_FIELD(disable_logs, DisableLogs, bool); DECL_ARGUMENT_FIELD(disable_logs, DisableLogs, bool);
// Pass a set of op types to enable its mkldnn kernel // Pass a set of op types to enable its mkldnn kernel
DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes, DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types,
MKLDNNEnabledOpTypes,
std::unordered_set<std::string>); std::unordered_set<std::string>);
// The cache capacity of different input shapes for mkldnn. // The cache capacity of different input shapes for mkldnn.
DECL_ARGUMENT_FIELD(mkldnn_cache_capacity, MkldnnCacheCapacity, int); DECL_ARGUMENT_FIELD(mkldnn_cache_capacity, MkldnnCacheCapacity, int);
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
// A set of op types to enable their quantized kernels // A set of op types to enable their quantized kernels
DECL_ARGUMENT_FIELD(quantize_enabled_op_types, QuantizeEnabledOpTypes, DECL_ARGUMENT_FIELD(quantize_enabled_op_types,
QuantizeEnabledOpTypes,
std::unordered_set<std::string>); std::unordered_set<std::string>);
// A set of op IDs to exclude from enabling their quantized kernels // A set of op IDs to exclude from enabling their quantized kernels
DECL_ARGUMENT_FIELD(quantize_excluded_op_ids, QuantizeExcludedOpIds, DECL_ARGUMENT_FIELD(quantize_excluded_op_ids,
QuantizeExcludedOpIds,
std::unordered_set<int>); std::unordered_set<int>);
// Scales for variables to be quantized // Scales for variables to be quantized
DECL_ARGUMENT_FIELD(quant_var_scales, QuantVarScales, VarQuantScale); DECL_ARGUMENT_FIELD(quant_var_scales, QuantVarScales, VarQuantScale);
// A set of op types to enable their bfloat16 kernels // A set of op types to enable their bfloat16 kernels
DECL_ARGUMENT_FIELD(bfloat16_enabled_op_types, Bfloat16EnabledOpTypes, DECL_ARGUMENT_FIELD(bfloat16_enabled_op_types,
Bfloat16EnabledOpTypes,
std::unordered_set<std::string>); std::unordered_set<std::string>);
DECL_ARGUMENT_FIELD(use_mkldnn_int8, UseMkldnnInt8, bool); DECL_ARGUMENT_FIELD(use_mkldnn_int8, UseMkldnnInt8, bool);
...@@ -190,9 +201,6 @@ struct Argument { ...@@ -190,9 +201,6 @@ struct Argument {
DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
DECL_ARGUMENT_FIELD(use_fc_padding, UseFcPadding, bool); DECL_ARGUMENT_FIELD(use_fc_padding, UseFcPadding, bool);
DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
DECL_ARGUMENT_FIELD(use_gpu_fp16, UseGPUFp16, bool);
DECL_ARGUMENT_FIELD(gpu_fp16_disabled_op_types, GpuFp16DisabledOpTypes,
std::unordered_set<std::string>);
// Usually use for trt dynamic shape. // Usually use for trt dynamic shape.
// TRT will select the best kernel according to opt shape // TRT will select the best kernel according to opt shape
...@@ -209,25 +217,33 @@ struct Argument { ...@@ -209,25 +217,33 @@ struct Argument {
DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int); DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
DECL_ARGUMENT_FIELD(tensorrt_disabled_ops, TensorRtDisabledOPs, DECL_ARGUMENT_FIELD(tensorrt_disabled_ops,
TensorRtDisabledOPs,
std::vector<std::string>); std::vector<std::string>);
DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode, DECL_ARGUMENT_FIELD(tensorrt_precision_mode,
TensorRtPrecisionMode,
AnalysisConfig::Precision); AnalysisConfig::Precision);
DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine, DECL_ARGUMENT_FIELD(tensorrt_use_static_engine,
TensorRtUseStaticEngine,
bool); bool);
DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool); DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool);
DECL_ARGUMENT_FIELD(tensorrt_use_varseqlen, TensorRtUseOSS, bool); DECL_ARGUMENT_FIELD(tensorrt_use_varseqlen, TensorRtUseOSS, bool);
DECL_ARGUMENT_FIELD(tensorrt_with_interleaved, TensorRtWithInterleaved, bool); DECL_ARGUMENT_FIELD(tensorrt_with_interleaved, TensorRtWithInterleaved, bool);
DECL_ARGUMENT_FIELD(tensorrt_transformer_posid, TensorRtTransformerPosid, DECL_ARGUMENT_FIELD(tensorrt_transformer_posid,
TensorRtTransformerPosid,
std::string); std::string);
DECL_ARGUMENT_FIELD(tensorrt_transformer_maskid, TensorRtTransformerMaskid, DECL_ARGUMENT_FIELD(tensorrt_transformer_maskid,
TensorRtTransformerMaskid,
std::string); std::string);
DECL_ARGUMENT_FIELD(tensorrt_shape_range_info_path, DECL_ARGUMENT_FIELD(tensorrt_shape_range_info_path,
TensorRtShapeRangeInfoPath, std::string); TensorRtShapeRangeInfoPath,
DECL_ARGUMENT_FIELD(tensorrt_tuned_dynamic_shape, TensorRtTunedDynamicShape, std::string);
DECL_ARGUMENT_FIELD(tensorrt_tuned_dynamic_shape,
TensorRtTunedDynamicShape,
bool); bool);
DECL_ARGUMENT_FIELD(tensorrt_allow_build_at_runtime, DECL_ARGUMENT_FIELD(tensorrt_allow_build_at_runtime,
TensorRtAllowBuildAtRuntime, bool); TensorRtAllowBuildAtRuntime,
bool);
DECL_ARGUMENT_FIELD(tensorrt_use_inspector, TensorRtUseInspector, bool); DECL_ARGUMENT_FIELD(tensorrt_use_inspector, TensorRtUseInspector, bool);
DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool); DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool);
...@@ -235,10 +251,12 @@ struct Argument { ...@@ -235,10 +251,12 @@ struct Argument {
DECL_ARGUMENT_FIELD(dlnne_max_batch_size, DlnneMaxBatchSize, int); DECL_ARGUMENT_FIELD(dlnne_max_batch_size, DlnneMaxBatchSize, int);
DECL_ARGUMENT_FIELD(dlnne_workspace_size, DlnneWorkspaceSize, int); DECL_ARGUMENT_FIELD(dlnne_workspace_size, DlnneWorkspaceSize, int);
DECL_ARGUMENT_FIELD(lite_passes_filter, LitePassesFilter, DECL_ARGUMENT_FIELD(lite_passes_filter,
LitePassesFilter,
std::vector<std::string>); std::vector<std::string>);
DECL_ARGUMENT_FIELD(lite_ops_filter, LiteOpsFilter, std::vector<std::string>); DECL_ARGUMENT_FIELD(lite_ops_filter, LiteOpsFilter, std::vector<std::string>);
DECL_ARGUMENT_FIELD(lite_precision_mode, LitePrecisionMode, DECL_ARGUMENT_FIELD(lite_precision_mode,
LitePrecisionMode,
AnalysisConfig::Precision); AnalysisConfig::Precision);
DECL_ARGUMENT_FIELD(lite_zero_copy, LiteZeroCopy, bool); DECL_ARGUMENT_FIELD(lite_zero_copy, LiteZeroCopy, bool);
...@@ -252,19 +270,26 @@ struct Argument { ...@@ -252,19 +270,26 @@ struct Argument {
DECL_ARGUMENT_FIELD(xpu_device_id, XpuDeviceId, int); DECL_ARGUMENT_FIELD(xpu_device_id, XpuDeviceId, int);
DECL_ARGUMENT_FIELD(use_nnadapter, UseNNAdapter, bool); DECL_ARGUMENT_FIELD(use_nnadapter, UseNNAdapter, bool);
DECL_ARGUMENT_FIELD(nnadapter_model_cache_dir, NNAdapterModelCacheDir, DECL_ARGUMENT_FIELD(nnadapter_model_cache_dir,
NNAdapterModelCacheDir,
std::string); std::string);
DECL_ARGUMENT_FIELD(nnadapter_device_names, NNAdapterDeviceNames, DECL_ARGUMENT_FIELD(nnadapter_device_names,
NNAdapterDeviceNames,
std::vector<std::string>); std::vector<std::string>);
DECL_ARGUMENT_FIELD(nnadapter_context_properties, NNAdapterContextProperties, DECL_ARGUMENT_FIELD(nnadapter_context_properties,
NNAdapterContextProperties,
std::string); std::string);
DECL_ARGUMENT_FIELD(nnadapter_subgraph_partition_config_buffer, DECL_ARGUMENT_FIELD(nnadapter_subgraph_partition_config_buffer,
NNAdapterSubgraphPartitionConfigBuffer, std::string); NNAdapterSubgraphPartitionConfigBuffer,
std::string);
DECL_ARGUMENT_FIELD(nnadapter_subgraph_partition_config_path, DECL_ARGUMENT_FIELD(nnadapter_subgraph_partition_config_path,
NNAdapterSubgraphPartitionConfigPath, std::string); NNAdapterSubgraphPartitionConfigPath,
DECL_ARGUMENT_FIELD(nnadapter_model_cache_token, NNAdapterModelCacheToken, std::string);
DECL_ARGUMENT_FIELD(nnadapter_model_cache_token,
NNAdapterModelCacheToken,
std::vector<std::string>); std::vector<std::string>);
DECL_ARGUMENT_FIELD(nnadapter_model_cache_buffer, NNAdapterModelCacheBuffer, DECL_ARGUMENT_FIELD(nnadapter_model_cache_buffer,
NNAdapterModelCacheBuffer,
std::vector<std::vector<char>>); std::vector<std::vector<char>>);
// Memory optimized related. // Memory optimized related.
...@@ -275,13 +300,15 @@ struct Argument { ...@@ -275,13 +300,15 @@ struct Argument {
DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int); DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int);
// The program transformed by IR analysis phase. // The program transformed by IR analysis phase.
DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram, DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program,
IrAnalyzedProgram,
framework::proto::ProgramDesc); framework::proto::ProgramDesc);
DECL_ARGUMENT_FIELD(fusion_statis, FusionStatis, fusion_statis_t); DECL_ARGUMENT_FIELD(fusion_statis, FusionStatis, fusion_statis_t);
// Only used in paddle-lite subgraph. // Only used in paddle-lite subgraph.
DECL_ARGUMENT_FIELD(cpu_math_library_num_threads, CpuMathLibraryNumThreads, DECL_ARGUMENT_FIELD(cpu_math_library_num_threads,
CpuMathLibraryNumThreads,
int); int);
// ipu related // ipu related
...@@ -293,7 +320,8 @@ struct Argument { ...@@ -293,7 +320,8 @@ struct Argument {
DECL_ARGUMENT_FIELD(ipu_enable_fp16, IpuEnableFp16, bool); DECL_ARGUMENT_FIELD(ipu_enable_fp16, IpuEnableFp16, bool);
DECL_ARGUMENT_FIELD(ipu_replica_num, IpuReplicaNum, int); DECL_ARGUMENT_FIELD(ipu_replica_num, IpuReplicaNum, int);
DECL_ARGUMENT_FIELD(ipu_available_memory_proportion, DECL_ARGUMENT_FIELD(ipu_available_memory_proportion,
IpuAvailableMemoryProportion, float); IpuAvailableMemoryProportion,
float);
DECL_ARGUMENT_FIELD(ipu_enable_half_partial, IpuEnableHalfPartial, bool); DECL_ARGUMENT_FIELD(ipu_enable_half_partial, IpuEnableHalfPartial, bool);
// npu related // npu related
...@@ -306,7 +334,8 @@ struct Argument { ...@@ -306,7 +334,8 @@ struct Argument {
#define ARGUMENT_CHECK_FIELD(argument__, fieldname__) \ #define ARGUMENT_CHECK_FIELD(argument__, fieldname__) \
PADDLE_ENFORCE_EQ( \ PADDLE_ENFORCE_EQ( \
argument__->Has(#fieldname__), true, \ argument__->Has(#fieldname__), \
true, \
platform::errors::PreconditionNotMet( \ platform::errors::PreconditionNotMet( \
"the argument field [%s] should be set", #fieldname__)); "the argument field [%s] should be set", #fieldname__));
......
...@@ -68,12 +68,15 @@ void IRPassManager::CreatePasses(Argument *argument, ...@@ -68,12 +68,15 @@ void IRPassManager::CreatePasses(Argument *argument,
auto precision_mode = argument->tensorrt_precision_mode(); auto precision_mode = argument->tensorrt_precision_mode();
bool enable_int8 = precision_mode == AnalysisConfig::Precision::kInt8; bool enable_int8 = precision_mode == AnalysisConfig::Precision::kInt8;
pass->Set("enable_int8", new bool(enable_int8)); pass->Set("enable_int8", new bool(enable_int8));
pass->Set("max_input_shape", new std::map<std::string, std::vector<int>>( pass->Set("max_input_shape",
argument->max_input_shape())); new std::map<std::string, std::vector<int>>(
pass->Set("min_input_shape", new std::map<std::string, std::vector<int>>( argument->max_input_shape()));
argument->min_input_shape())); pass->Set("min_input_shape",
pass->Set("optim_input_shape", new std::map<std::string, std::vector<int>>( new std::map<std::string, std::vector<int>>(
argument->optim_input_shape())); argument->min_input_shape()));
pass->Set("optim_input_shape",
new std::map<std::string, std::vector<int>>(
argument->optim_input_shape()));
// tuned trt dynamic_shape // tuned trt dynamic_shape
pass->Set("trt_tuned_dynamic_shape", pass->Set("trt_tuned_dynamic_shape",
new bool(argument->tensorrt_tuned_dynamic_shape())); new bool(argument->tensorrt_tuned_dynamic_shape()));
...@@ -143,14 +146,16 @@ void IRPassManager::CreatePasses(Argument *argument, ...@@ -143,14 +146,16 @@ void IRPassManager::CreatePasses(Argument *argument,
bool int8_valid = !(model_from_memory && optim_cache_dir.empty() && bool int8_valid = !(model_from_memory && optim_cache_dir.empty() &&
enable_int8 && use_calib_mode); enable_int8 && use_calib_mode);
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
int8_valid, true, int8_valid,
true,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"When you are in TRT INT8 mode, and load model from " "When you are in TRT INT8 mode, and load model from "
"memory, you should set optim_cache_dir using " "memory, you should set optim_cache_dir using "
"config.SetOptimCacheDir()")); "config.SetOptimCacheDir()"));
if (model_from_memory && use_static_engine) { if (model_from_memory && use_static_engine) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
optim_cache_dir.empty(), false, optim_cache_dir.empty(),
false,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"When you are using Paddle-TRT, and using load model " "When you are using Paddle-TRT, and using load model "
"from memory, and also set the use_static to true. " "from memory, and also set the use_static to true. "
...@@ -161,7 +166,8 @@ void IRPassManager::CreatePasses(Argument *argument, ...@@ -161,7 +166,8 @@ void IRPassManager::CreatePasses(Argument *argument,
if (!optim_cache_dir.empty()) { if (!optim_cache_dir.empty()) {
if (!PathExists(optim_cache_dir)) { if (!PathExists(optim_cache_dir)) {
PADDLE_ENFORCE_NE( PADDLE_ENFORCE_NE(
MKDIR(optim_cache_dir.c_str()), -1, MKDIR(optim_cache_dir.c_str()),
-1,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"Can not create optimize cache directory: %s, Make sure you " "Can not create optimize cache directory: %s, Make sure you "
"have permission to write", "have permission to write",
...@@ -187,8 +193,9 @@ void IRPassManager::CreatePasses(Argument *argument, ...@@ -187,8 +193,9 @@ void IRPassManager::CreatePasses(Argument *argument,
new std::string(argument->tensorrt_shape_range_info_path())); new std::string(argument->tensorrt_shape_range_info_path()));
pass->Set("trt_allow_build_at_runtime", pass->Set("trt_allow_build_at_runtime",
new bool(argument->tensorrt_allow_build_at_runtime())); new bool(argument->tensorrt_allow_build_at_runtime()));
pass->Set("trt_disabled_ops", new std::vector<std::string>( pass->Set(
argument->tensorrt_disabled_ops())); "trt_disabled_ops",
new std::vector<std::string>(argument->tensorrt_disabled_ops()));
pass->Set("trt_use_dla", new bool(argument->tensorrt_use_dla())); pass->Set("trt_use_dla", new bool(argument->tensorrt_use_dla()));
pass->Set("trt_dla_core", new int(argument->tensorrt_dla_core())); pass->Set("trt_dla_core", new int(argument->tensorrt_dla_core()));
// Setting the disable_trt_plugin_fp16 to true means that TRT plugin will // Setting the disable_trt_plugin_fp16 to true means that TRT plugin will
...@@ -200,10 +207,6 @@ void IRPassManager::CreatePasses(Argument *argument, ...@@ -200,10 +207,6 @@ void IRPassManager::CreatePasses(Argument *argument,
new int(argument->dlnne_min_subgraph_size())); new int(argument->dlnne_min_subgraph_size()));
pass->Set("program", pass->Set("program",
new framework::ProgramDesc *(&argument->main_program())); new framework::ProgramDesc *(&argument->main_program()));
} else if (pass_name == "mixed_precision_configure_pass") {
pass->Set("gpu_fp16_disabled_op_types",
new std::unordered_set<std::string>(
argument->gpu_fp16_disabled_op_types()));
} }
if (pass_name == "lite_subgraph_pass") { if (pass_name == "lite_subgraph_pass") {
bool lite_enable_int8 = bool lite_enable_int8 =
...@@ -272,8 +275,9 @@ std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) { ...@@ -272,8 +275,9 @@ std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) {
if (passes_.empty()) { if (passes_.empty()) {
return graph; return graph;
} }
PADDLE_ENFORCE_NOT_NULL(graph.get(), platform::errors::PreconditionNotMet( PADDLE_ENFORCE_NOT_NULL(
"Graph cannot be NULL.")); graph.get(),
platform::errors::PreconditionNotMet("Graph cannot be NULL."));
// Apply all the passes // Apply all the passes
for (const auto &pass : passes_) { for (const auto &pass : passes_) {
if (pass->Type() != "graph_viz_pass" && !disable_logs_) { if (pass->Type() != "graph_viz_pass" && !disable_logs_) {
......
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
#include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -37,7 +36,8 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) { ...@@ -37,7 +36,8 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) {
LOG(INFO) << "Sync params from CPU to NPU"; LOG(INFO) << "Sync params from CPU to NPU";
PADDLE_ENFORCE_EQ(argument->npu_device_id_valid(), true, PADDLE_ENFORCE_EQ(argument->npu_device_id_valid(),
true,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"The npu_device_id field should be valid")); "The npu_device_id field should be valid"));
platform::Place place = platform::NPUPlace(argument->npu_device_id()); platform::Place place = platform::NPUPlace(argument->npu_device_id());
...@@ -46,8 +46,9 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) { ...@@ -46,8 +46,9 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) {
for (auto &var_name : all_vars) { for (auto &var_name : all_vars) {
auto *var = scope->FindLocalVar(var_name); auto *var = scope->FindLocalVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var, platform::errors::PreconditionNotMet( PADDLE_ENFORCE_NOT_NULL(
"The var should not be nullptr")); var,
platform::errors::PreconditionNotMet("The var should not be nullptr"));
if (var->IsType<framework::LoDTensor>() || if (var->IsType<framework::LoDTensor>() ||
var->IsType<framework::Tensor>()) { var->IsType<framework::Tensor>()) {
...@@ -67,26 +68,6 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) { ...@@ -67,26 +68,6 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) {
#else #else
// Fills `var_name_op_type_map` with (persistable input variable name ->
// type of an op reading it) for the ops of `graph`, visited in the order
// returned by TopologyVarientSort. Feed and fetch ops are skipped. Existing
// keys are never overwritten, so a variable read by several ops keeps the
// type of the first op visited.
void IrParamsSyncAmongDevicesPass::GetVarNameToOpTypeMap(
    const framework::ir::Graph &graph,
    std::unordered_map<std::string, std::string> *var_name_op_type_map) {
  const auto sorted_nodes = framework::ir::TopologyVarientSort(
      graph, static_cast<framework::ir::SortKind>(0));
  for (auto *node : sorted_nodes) {
    if (!node->IsOp()) continue;
    const std::string &op_type = node->Op()->Type();
    if (op_type == "feed" || op_type == "fetch") continue;
    for (auto *input : node->inputs) {
      if (!input->IsVar() || !input->Var()->Persistable()) continue;
      var_name_op_type_map->emplace(input->Var()->Name(), op_type);
    }
  }
}
void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
// The parameters are on the cpu, therefore, synchronization is not necessary. // The parameters are on the cpu, therefore, synchronization is not necessary.
if (!argument->use_gpu()) return; if (!argument->use_gpu()) return;
...@@ -100,7 +81,8 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { ...@@ -100,7 +81,8 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
LOG(INFO) << "Sync params from CPU to GPU"; LOG(INFO) << "Sync params from CPU to GPU";
PADDLE_ENFORCE_EQ(argument->gpu_device_id_valid(), true, PADDLE_ENFORCE_EQ(argument->gpu_device_id_valid(),
true,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"The gpu_device_id field should be valid")); "The gpu_device_id field should be valid"));
platform::Place place = platform::CUDAPlace(argument->gpu_device_id()); platform::Place place = platform::CUDAPlace(argument->gpu_device_id());
...@@ -124,54 +106,34 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { ...@@ -124,54 +106,34 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
if (with_dynamic_shape) { if (with_dynamic_shape) {
reserve_cpu_weights = true; reserve_cpu_weights = true;
} }
bool mixed_precision_mode =
argument->Has("use_gpu_fp16") && argument->use_gpu_fp16();
std::unordered_map<std::string, std::string> var_name_op_type_map{};
std::unordered_set<std::string> blacklist{};
if (mixed_precision_mode) {
GetVarNameToOpTypeMap(graph, &var_name_op_type_map);
blacklist = argument->gpu_fp16_disabled_op_types();
}
for (auto &var_name : all_vars) { for (auto &var_name : all_vars) {
if (std::count(repetitive_params.begin(), repetitive_params.end(), if (std::count(
var_name)) { repetitive_params.begin(), repetitive_params.end(), var_name)) {
if (!reserve_cpu_weights) { if (!reserve_cpu_weights) {
scope->EraseVars({var_name}); scope->EraseVars({var_name});
} }
continue; continue;
} }
auto *var = scope->FindLocalVar(var_name); auto *var = scope->FindLocalVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var, platform::errors::PreconditionNotMet( PADDLE_ENFORCE_NOT_NULL(
"The var should not be nullptr")); var,
platform::errors::PreconditionNotMet("The var should not be nullptr"));
if (var->IsType<framework::LoDTensor>() || if (var->IsType<framework::LoDTensor>() ||
var->IsType<framework::Tensor>()) { var->IsType<framework::Tensor>()) {
auto *t = var->GetMutable<framework::LoDTensor>(); auto *t = var->GetMutable<framework::LoDTensor>();
bool is_float = t->dtype() == paddle::experimental::DataType::FLOAT32 || platform::CPUPlace cpu_place;
t->dtype() == paddle::experimental::DataType::FLOAT64; framework::LoDTensor temp_tensor;
if (mixed_precision_mode && temp_tensor.Resize(t->dims());
!blacklist.count(var_name_op_type_map[var_name]) && is_float) { temp_tensor.mutable_data<float>(cpu_place);
framework::Tensor half_tensor;
half_tensor.set_type(paddle::experimental::DataType::FLOAT16); // Copy the parameter data to a tmp tensor.
half_tensor.Resize(t->dims()); paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor);
auto *half_data = // Reallocation the space on GPU
half_tensor.mutable_data<float16>(platform::CPUPlace()); t->clear();
for (int i = 0; i < t->numel(); i++) {
auto *data = t->mutable_data<float>(platform::CPUPlace()); // Copy parameter data to newly allocated GPU space.
half_data[i] = static_cast<float16>(data[i]); paddle::framework::TensorCopySync(temp_tensor, place, t);
}
t->clear();
paddle::framework::TensorCopySync(half_tensor, place, t);
} else {
platform::CPUPlace cpu_place;
framework::LoDTensor temp_tensor;
temp_tensor.Resize(t->dims());
paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor);
t->clear();
paddle::framework::TensorCopySync(temp_tensor, place, t);
}
} }
} }
} }
...@@ -180,7 +142,8 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { ...@@ -180,7 +142,8 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
argument->scope_valid(), true, argument->scope_valid(),
true,
platform::errors::PreconditionNotMet("The scope field should be valid")); platform::errors::PreconditionNotMet("The scope field should be valid"));
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
......
...@@ -38,12 +38,7 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass { ...@@ -38,12 +38,7 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass {
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
void CopyParamsToNpu(Argument *argument); void CopyParamsToNpu(Argument *argument);
#else #else
void CopyParamsToGpu(Argument *argument);
void GetVarNameToOpTypeMap(
const framework::ir::Graph& graph,
std::unordered_map<std::string, std::string>* var_name_op_type_map);
void CopyParamsToGpu(Argument* argument);
#endif #endif
}; };
......
...@@ -84,7 +84,6 @@ void AnalysisConfig::SetModel(const std::string &prog_file_path, ...@@ -84,7 +84,6 @@ void AnalysisConfig::SetModel(const std::string &prog_file_path,
Update(); Update();
} }
void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
int device_id) { int device_id) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
...@@ -101,16 +100,18 @@ void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, ...@@ -101,16 +100,18 @@ void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
} }
void AnalysisConfig::SetExecStream(void *stream) { void AnalysisConfig::SetExecStream(void *stream) {
PADDLE_ENFORCE_NOT_NULL(stream, platform::errors::InvalidArgument( PADDLE_ENFORCE_NOT_NULL(
"`stream` should not be nullptr")); stream,
platform::errors::InvalidArgument("`stream` should not be nullptr"));
exec_stream_ = stream; exec_stream_ = stream;
use_external_stream_ = true; use_external_stream_ = true;
Update(); Update();
} }
void *AnalysisConfig::GetExecStream() const { void *AnalysisConfig::GetExecStream() const {
PADDLE_ENFORCE_NOT_NULL(exec_stream_, platform::errors::InvalidArgument( PADDLE_ENFORCE_NOT_NULL(
"`stream` should not be nullptr")); exec_stream_,
platform::errors::InvalidArgument("`stream` should not be nullptr"));
return exec_stream_; return exec_stream_;
} }
...@@ -124,27 +125,16 @@ void AnalysisConfig::DisableGpu() { ...@@ -124,27 +125,16 @@ void AnalysisConfig::DisableGpu() {
Update(); Update();
} }
void AnalysisConfig::Exp_EnableUseGpuFp16(
std::unordered_set<std::string> op_list) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
use_gpu_fp16_ = true;
gpu_fp16_disabled_op_types_.insert(op_list.begin(), op_list.end());
#else
LOG(ERROR) << "Please compile with gpu to Exp_EnableUseGpuFp16()";
use_gpu_fp16_ = false;
#endif
Update();
}
void AnalysisConfig::DisableFCPadding() { void AnalysisConfig::DisableFCPadding() {
use_fc_padding_ = false; use_fc_padding_ = false;
Update(); Update();
} }
void AnalysisConfig::EnableXpu(int l3_workspace_size, bool locked, void AnalysisConfig::EnableXpu(int l3_workspace_size,
bool autotune, const std::string &autotune_file, bool locked,
bool autotune,
const std::string &autotune_file,
const std::string &precision, const std::string &precision,
bool adaptive_seqlen) { bool adaptive_seqlen) {
use_xpu_ = true; use_xpu_ = true;
...@@ -158,7 +148,8 @@ void AnalysisConfig::EnableXpu(int l3_workspace_size, bool locked, ...@@ -158,7 +148,8 @@ void AnalysisConfig::EnableXpu(int l3_workspace_size, bool locked,
} }
void AnalysisConfig::SetXpuDeviceId(int device_id) { void AnalysisConfig::SetXpuDeviceId(int device_id) {
PADDLE_ENFORCE_EQ(use_xpu_, true, PADDLE_ENFORCE_EQ(use_xpu_,
true,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"Should call EnableXpu before SetXpuDeviceId.")); "Should call EnableXpu before SetXpuDeviceId."));
xpu_device_id_ = device_id; xpu_device_id_ = device_id;
...@@ -190,7 +181,8 @@ void AnalysisConfig::EnableCustomDevice(const std::string &device_type, ...@@ -190,7 +181,8 @@ void AnalysisConfig::EnableCustomDevice(const std::string &device_type,
Update(); Update();
} }
void AnalysisConfig::EnableIpu(int ipu_device_num, int ipu_micro_batch_size, void AnalysisConfig::EnableIpu(int ipu_device_num,
int ipu_micro_batch_size,
bool ipu_enable_pipelining, bool ipu_enable_pipelining,
int ipu_batches_per_step) { int ipu_batches_per_step) {
enable_ir_optim_ = true; enable_ir_optim_ = true;
...@@ -204,7 +196,8 @@ void AnalysisConfig::EnableIpu(int ipu_device_num, int ipu_micro_batch_size, ...@@ -204,7 +196,8 @@ void AnalysisConfig::EnableIpu(int ipu_device_num, int ipu_micro_batch_size,
Update(); Update();
} }
void AnalysisConfig::SetIpuConfig(bool ipu_enable_fp16, int ipu_replica_num, void AnalysisConfig::SetIpuConfig(bool ipu_enable_fp16,
int ipu_replica_num,
float ipu_available_memory_proportion, float ipu_available_memory_proportion,
bool ipu_enable_half_partial) { bool ipu_enable_half_partial) {
ipu_enable_fp16_ = ipu_enable_fp16; ipu_enable_fp16_ = ipu_enable_fp16;
...@@ -262,8 +255,6 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { ...@@ -262,8 +255,6 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(use_cudnn_); CP_MEMBER(use_cudnn_);
CP_MEMBER(gpu_device_id_); CP_MEMBER(gpu_device_id_);
CP_MEMBER(memory_pool_init_size_mb_); CP_MEMBER(memory_pool_init_size_mb_);
CP_MEMBER(use_gpu_fp16_);
CP_MEMBER(gpu_fp16_disabled_op_types_);
CP_MEMBER(enable_memory_optim_); CP_MEMBER(enable_memory_optim_);
// TensorRT related. // TensorRT related.
...@@ -366,7 +357,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { ...@@ -366,7 +357,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(custom_device_id_); CP_MEMBER(custom_device_id_);
if (use_gpu_) { if (use_gpu_) {
PADDLE_ENFORCE_EQ(use_xpu_, false, PADDLE_ENFORCE_EQ(use_xpu_,
false,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Only one choice can be made between CPU and XPU.")); "Only one choice can be made between CPU and XPU."));
pass_builder_.reset(new GpuPassStrategy( pass_builder_.reset(new GpuPassStrategy(
...@@ -406,8 +398,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { ...@@ -406,8 +398,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
std::sort(all_passes.begin(), all_passes.end()); std::sort(all_passes.begin(), all_passes.end());
std::sort(other_passes.begin(), other_passes.end()); std::sort(other_passes.begin(), other_passes.end());
std::vector<std::string> deleted_passes; std::vector<std::string> deleted_passes;
std::set_difference(all_passes.begin(), all_passes.end(), std::set_difference(all_passes.begin(),
other_passes.begin(), other_passes.end(), all_passes.end(),
other_passes.begin(),
other_passes.end(),
std::inserter(deleted_passes, deleted_passes.begin())); std::inserter(deleted_passes, deleted_passes.begin()));
for (auto ps : deleted_passes) { for (auto ps : deleted_passes) {
pass_builder_->DeletePass(ps); pass_builder_->DeletePass(ps);
...@@ -516,8 +510,11 @@ MkldnnQuantizerConfig *AnalysisConfig::mkldnn_quantizer_config() const { ...@@ -516,8 +510,11 @@ MkldnnQuantizerConfig *AnalysisConfig::mkldnn_quantizer_config() const {
} }
void AnalysisConfig::EnableTensorRtEngine( void AnalysisConfig::EnableTensorRtEngine(
int workspace_size, int max_batch_size, int min_subgraph_size, int workspace_size,
AnalysisConfig::Precision precision_mode, bool use_static, int max_batch_size,
int min_subgraph_size,
AnalysisConfig::Precision precision_mode,
bool use_static,
bool use_calib_mode) { bool use_calib_mode) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (!use_gpu()) { if (!use_gpu()) {
...@@ -594,19 +591,22 @@ void AnalysisConfig::Update() { ...@@ -594,19 +591,22 @@ void AnalysisConfig::Update() {
pass_builder_.reset(new IpuPassStrategy); pass_builder_.reset(new IpuPassStrategy);
} else if (use_xpu()) { } else if (use_xpu()) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
use_gpu(), false, use_gpu(),
false,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Only one choice can be made between CPU and XPU.")); "Only one choice can be made between CPU and XPU."));
pass_builder_.reset(new XpuPassStrategy); pass_builder_.reset(new XpuPassStrategy);
} else if (use_npu()) { } else if (use_npu()) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
use_gpu(), false, use_gpu(),
false,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Only one choice can be made between GPU and NPU.")); "Only one choice can be made between GPU and NPU."));
pass_builder_.reset(new NpuPassStrategy); pass_builder_.reset(new NpuPassStrategy);
} else if (use_custom_device()) { } else if (use_custom_device()) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
use_gpu(), false, use_gpu(),
false,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Only one choice can be made between GPU and CustomDevice.")); "Only one choice can be made between GPU and CustomDevice."));
pass_builder_.reset(new CustomDevicePassStrategy); pass_builder_.reset(new CustomDevicePassStrategy);
...@@ -624,21 +624,24 @@ void AnalysisConfig::Update() { ...@@ -624,21 +624,24 @@ void AnalysisConfig::Update() {
*static_cast<IpuPassStrategy *>(pass_builder_.get()))); *static_cast<IpuPassStrategy *>(pass_builder_.get())));
} else if (use_xpu()) { } else if (use_xpu()) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
use_gpu(), false, use_gpu(),
false,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Only one choice can be made between CPU and XPU.")); "Only one choice can be made between CPU and XPU."));
pass_builder_.reset(new XpuPassStrategy( pass_builder_.reset(new XpuPassStrategy(
*static_cast<XpuPassStrategy *>(pass_builder_.get()))); *static_cast<XpuPassStrategy *>(pass_builder_.get())));
} else if (use_npu()) { } else if (use_npu()) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
use_gpu(), false, use_gpu(),
false,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Only one choice can be made between GPU and NPU.")); "Only one choice can be made between GPU and NPU."));
pass_builder_.reset(new NpuPassStrategy( pass_builder_.reset(new NpuPassStrategy(
*static_cast<NpuPassStrategy *>(pass_builder_.get()))); *static_cast<NpuPassStrategy *>(pass_builder_.get())));
} else if (use_custom_device()) { } else if (use_custom_device()) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
use_gpu(), false, use_gpu(),
false,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Only one choice can be made between GPU and CustomDevice.")); "Only one choice can be made between GPU and CustomDevice."));
pass_builder_.reset(new CustomDevicePassStrategy( pass_builder_.reset(new CustomDevicePassStrategy(
...@@ -677,20 +680,6 @@ void AnalysisConfig::Update() { ...@@ -677,20 +680,6 @@ void AnalysisConfig::Update() {
#endif #endif
} }
if (use_gpu_fp16_) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (!enable_ir_optim_) {
LOG(ERROR) << "Exp_EnableUseGpuFp16() only works when IR optimization is "
"enabled.";
} else if (!use_gpu()) {
LOG(ERROR)
<< "Exp_EnableUseGpuFp16() only works when use_gpu is enabled.";
} else {
pass_builder()->Exp_EnableUseGpuFp16();
}
#endif
}
if (use_mkldnn_) { if (use_mkldnn_) {
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
if (!enable_ir_optim_) { if (!enable_ir_optim_) {
...@@ -749,7 +738,8 @@ void AnalysisConfig::Update() { ...@@ -749,7 +738,8 @@ void AnalysisConfig::Update() {
#endif #endif
pass_builder()->ClearPasses(); pass_builder()->ClearPasses();
for (const auto &pass : kLiteSubgraphPasses) { for (const auto &pass : kLiteSubgraphPasses) {
if (std::find(lite_passes_filter_.begin(), lite_passes_filter_.end(), if (std::find(lite_passes_filter_.begin(),
lite_passes_filter_.end(),
pass) == lite_passes_filter_.end()) { pass) == lite_passes_filter_.end()) {
pass_builder()->AppendPass(pass); pass_builder()->AppendPass(pass);
} }
...@@ -758,7 +748,8 @@ void AnalysisConfig::Update() { ...@@ -758,7 +748,8 @@ void AnalysisConfig::Update() {
if (use_xpu_) { if (use_xpu_) {
#if (defined LITE_SUBGRAPH_WITH_XPU) || (defined PADDLE_WITH_XPU) #if (defined LITE_SUBGRAPH_WITH_XPU) || (defined PADDLE_WITH_XPU)
PADDLE_ENFORCE_EQ(use_gpu_, false, PADDLE_ENFORCE_EQ(use_gpu_,
false,
platform::errors::Unavailable( platform::errors::Unavailable(
"Currently, XPU and GPU cannot be enabled in the " "Currently, XPU and GPU cannot be enabled in the "
"same analysis configuration.")); "same analysis configuration."));
...@@ -771,7 +762,8 @@ void AnalysisConfig::Update() { ...@@ -771,7 +762,8 @@ void AnalysisConfig::Update() {
if (use_npu_) { if (use_npu_) {
#if defined(PADDLE_WITH_ASCEND_CL) || defined(LITE_SUBGRAPH_WITH_NPU) #if defined(PADDLE_WITH_ASCEND_CL) || defined(LITE_SUBGRAPH_WITH_NPU)
PADDLE_ENFORCE_EQ(use_gpu_, false, PADDLE_ENFORCE_EQ(use_gpu_,
false,
platform::errors::Unavailable( platform::errors::Unavailable(
"Currently, NPU and GPU cannot be enabled in the " "Currently, NPU and GPU cannot be enabled in the "
"same analysis configuration.")); "same analysis configuration."));
...@@ -809,8 +801,6 @@ std::string AnalysisConfig::SerializeInfoCache() { ...@@ -809,8 +801,6 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << use_gpu_; ss << use_gpu_;
ss << use_external_stream_; ss << use_external_stream_;
ss << exec_stream_; ss << exec_stream_;
ss << use_gpu_fp16_;
for (auto &item : gpu_fp16_disabled_op_types_) ss << item;
ss << use_fc_padding_; ss << use_fc_padding_;
ss << gpu_device_id_; ss << gpu_device_id_;
ss << xpu_device_id_; ss << xpu_device_id_;
...@@ -957,7 +947,8 @@ void AnalysisConfig::DisableGlogInfo() { ...@@ -957,7 +947,8 @@ void AnalysisConfig::DisableGlogInfo() {
} }
void AnalysisConfig::EnableLiteEngine( void AnalysisConfig::EnableLiteEngine(
AnalysisConfig::Precision precision_mode, bool zero_copy, AnalysisConfig::Precision precision_mode,
bool zero_copy,
const std::vector<std::string> &passes_filter, const std::vector<std::string> &passes_filter,
const std::vector<std::string> &ops_filter) { const std::vector<std::string> &ops_filter) {
use_lite_ = true; use_lite_ = true;
...@@ -1057,9 +1048,9 @@ std::string AnalysisConfig::Summary() { ...@@ -1057,9 +1048,9 @@ std::string AnalysisConfig::Summary() {
// dynamic_shape // dynamic_shape
os.InsertRow({"tensorrt_enable_dynamic_shape", os.InsertRow({"tensorrt_enable_dynamic_shape",
min_input_shape_.empty() ? "false" : "true"}); min_input_shape_.empty() ? "false" : "true"});
os.InsertRow({"tensorrt_tuned_dynamic_shape", trt_tuned_dynamic_shape_ os.InsertRow(
? shape_range_info_path_ {"tensorrt_tuned_dynamic_shape",
: "false"}); trt_tuned_dynamic_shape_ ? shape_range_info_path_ : "false"});
os.InsertRow( os.InsertRow(
{"tensorrt_use_varseqlen", trt_use_varseqlen_ ? "true" : "false"}); {"tensorrt_use_varseqlen", trt_use_varseqlen_ ? "true" : "false"});
...@@ -1123,10 +1114,12 @@ LiteNNAdapterConfig &LiteNNAdapterConfig::SetModelCacheDir( ...@@ -1123,10 +1114,12 @@ LiteNNAdapterConfig &LiteNNAdapterConfig::SetModelCacheDir(
LiteNNAdapterConfig &LiteNNAdapterConfig::SetModelCacheBuffers( LiteNNAdapterConfig &LiteNNAdapterConfig::SetModelCacheBuffers(
const std::string &model_cache_token, const std::string &model_cache_token,
const std::vector<char> &model_cache_buffer) { const std::vector<char> &model_cache_buffer) {
PADDLE_ENFORCE_EQ(model_cache_token.empty(), false, PADDLE_ENFORCE_EQ(model_cache_token.empty(),
false,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"model_cache_token should not be empty.")); "model_cache_token should not be empty."));
PADDLE_ENFORCE_EQ(model_cache_buffer.empty(), false, PADDLE_ENFORCE_EQ(model_cache_buffer.empty(),
false,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"model_cache_buffer should not be empty.")); "model_cache_buffer should not be empty."));
PADDLE_ENFORCE_EQ(nnadapter_model_cache_buffers.count(model_cache_token), PADDLE_ENFORCE_EQ(nnadapter_model_cache_buffers.count(model_cache_token),
...@@ -1165,7 +1158,8 @@ void AnalysisConfig::CollectShapeRangeInfo( ...@@ -1165,7 +1158,8 @@ void AnalysisConfig::CollectShapeRangeInfo(
<< "all intermediate tensors in the compute graph and calculate " << "all intermediate tensors in the compute graph and calculate "
"the min_shape, max_shape and opt_shape."; "the min_shape, max_shape and opt_shape.";
collect_shape_range_info_ = true; collect_shape_range_info_ = true;
PADDLE_ENFORCE_EQ(shape_range_info_path.empty(), false, PADDLE_ENFORCE_EQ(shape_range_info_path.empty(),
false,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The shape_range_info_path should not be empty, please " "The shape_range_info_path should not be empty, please "
"re-check the argument.")); "re-check the argument."));
......
...@@ -1048,11 +1048,6 @@ void AnalysisPredictor::PrepareArgument() { ...@@ -1048,11 +1048,6 @@ void AnalysisPredictor::PrepareArgument() {
argument_.SetDlnneMinSubgraphSize(config_.dlnne_min_subgraph_size_); argument_.SetDlnneMinSubgraphSize(config_.dlnne_min_subgraph_size_);
} }
if (config_.gpu_fp16_enabled()) {
argument_.SetUseGPUFp16(true);
argument_.SetGpuFp16DisabledOpTypes(config_.gpu_fp16_disabled_op_types_);
}
if (config_.lite_engine_enabled()) { if (config_.lite_engine_enabled()) {
argument_.SetCpuMathLibraryNumThreads( argument_.SetCpuMathLibraryNumThreads(
config_.cpu_math_library_num_threads()); config_.cpu_math_library_num_threads());
......
...@@ -371,19 +371,6 @@ TEST(AnalysisPredictor, enable_onnxruntime) { ...@@ -371,19 +371,6 @@ TEST(AnalysisPredictor, enable_onnxruntime) {
ASSERT_TRUE(!config.use_onnxruntime()); ASSERT_TRUE(!config.use_onnxruntime());
} }
TEST(AnalysisPredictor, exp_enable_use_gpu_fp16) {
AnalysisConfig config;
config.SwitchIrOptim();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
config.EnableUseGpu(100, 0);
config.Exp_EnableUseGpuFp16();
ASSERT_TRUE(config.gpu_fp16_enabled());
#else
config.DisableGpu();
#endif
LOG(INFO) << config.Summary();
}
} // namespace paddle } // namespace paddle
namespace paddle_infer { namespace paddle_infer {
...@@ -443,19 +430,6 @@ TEST(Predictor, EnableONNXRuntime) { ...@@ -443,19 +430,6 @@ TEST(Predictor, EnableONNXRuntime) {
auto predictor = CreatePredictor(config); auto predictor = CreatePredictor(config);
} }
TEST(Predictor, Exp_EnableUseGpuFp16) {
Config config;
config.SetModel(FLAGS_dirname);
config.SwitchIrOptim();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
config.EnableUseGpu(100, 0);
config.Exp_EnableUseGpuFp16();
#else
config.DisableGpu();
#endif
auto predictor = CreatePredictor(config);
}
TEST(Tensor, CpuShareExternalData) { TEST(Tensor, CpuShareExternalData) {
Config config; Config config;
config.SetModel(FLAGS_dirname); config.SetModel(FLAGS_dirname);
...@@ -476,8 +450,8 @@ TEST(Tensor, CpuShareExternalData) { ...@@ -476,8 +450,8 @@ TEST(Tensor, CpuShareExternalData) {
auto out = predictor->GetOutputHandle("fc_1.tmp_2"); auto out = predictor->GetOutputHandle("fc_1.tmp_2");
auto out_shape = out->shape(); auto out_shape = out->shape();
std::vector<float> out_data; std::vector<float> out_data;
out_data.resize(std::accumulate(out_shape.begin(), out_shape.end(), 1, out_data.resize(std::accumulate(
std::multiplies<int>())); out_shape.begin(), out_shape.end(), 1, std::multiplies<int>()));
out->ShareExternalData<float>(out_data.data(), out_shape, PlaceType::kCPU); out->ShareExternalData<float>(out_data.data(), out_shape, PlaceType::kCPU);
predictor->Run(); predictor->Run();
...@@ -507,7 +481,9 @@ TEST(Tensor, GpuShareExternalData) { ...@@ -507,7 +481,9 @@ TEST(Tensor, GpuShareExternalData) {
for (size_t i = 0; i < 4; ++i) { for (size_t i = 0; i < 4; ++i) {
cudaMalloc(reinterpret_cast<void**>(&input_gpu[i]), 4 * sizeof(int64_t)); cudaMalloc(reinterpret_cast<void**>(&input_gpu[i]), 4 * sizeof(int64_t));
cudaMemcpy(input_gpu[i], input_data[i].data(), 4 * sizeof(int64_t), cudaMemcpy(input_gpu[i],
input_data[i].data(),
4 * sizeof(int64_t),
cudaMemcpyHostToDevice); cudaMemcpyHostToDevice);
} }
...@@ -519,9 +495,10 @@ TEST(Tensor, GpuShareExternalData) { ...@@ -519,9 +495,10 @@ TEST(Tensor, GpuShareExternalData) {
auto out = predictor->GetOutputHandle("fc_1.tmp_2"); auto out = predictor->GetOutputHandle("fc_1.tmp_2");
auto out_shape = out->shape(); auto out_shape = out->shape();
float* out_data = nullptr; float* out_data = nullptr;
auto out_size = std::accumulate(out_shape.begin(), out_shape.end(), 1, auto out_size =
std::multiplies<int>()) * std::accumulate(
sizeof(float); out_shape.begin(), out_shape.end(), 1, std::multiplies<int>()) *
sizeof(float);
cudaMalloc(reinterpret_cast<void**>(out_data), out_size * sizeof(float)); cudaMalloc(reinterpret_cast<void**>(out_data), out_size * sizeof(float));
out->ShareExternalData<float>(out_data, out_shape, PlaceType::kGPU); out->ShareExternalData<float>(out_data, out_shape, PlaceType::kGPU);
......
...@@ -253,19 +253,6 @@ struct PD_INFER_DECL AnalysisConfig { ...@@ -253,19 +253,6 @@ struct PD_INFER_DECL AnalysisConfig {
/// ///
/// ///
void DisableGpu(); void DisableGpu();
///
/// \brief Enable GPU fp16 precision computation, in experimental state.
///
/// \param op_list The operator type list.
///
void Exp_EnableUseGpuFp16(std::unordered_set<std::string> op_list = {});
///
/// \brief A boolean state telling whether the GPU fp16 precision is turned
/// on.
///
/// \return bool Whether the GPU fp16 precision is turned on.
///
bool gpu_fp16_enabled() const { return use_gpu_fp16_; }
/// ///
/// \brief Turn on XPU. /// \brief Turn on XPU.
...@@ -287,8 +274,10 @@ struct PD_INFER_DECL AnalysisConfig { ...@@ -287,8 +274,10 @@ struct PD_INFER_DECL AnalysisConfig {
/// \param precision Calculation accuracy of multi_encoder /// \param precision Calculation accuracy of multi_encoder
/// \param adaptive_seqlen Is the input of multi_encoder variable length /// \param adaptive_seqlen Is the input of multi_encoder variable length
/// ///
void EnableXpu(int l3_workspace_size = 0xfffc00, bool locked = false, void EnableXpu(int l3_workspace_size = 0xfffc00,
bool autotune = true, const std::string& autotune_file = "", bool locked = false,
bool autotune = true,
const std::string& autotune_file = "",
const std::string& precision = "int16", const std::string& precision = "int16",
bool adaptive_seqlen = false); bool adaptive_seqlen = false);
...@@ -301,7 +290,8 @@ struct PD_INFER_DECL AnalysisConfig { ...@@ -301,7 +290,8 @@ struct PD_INFER_DECL AnalysisConfig {
/// \param ipu_enable_pipelining enable pipelining. /// \param ipu_enable_pipelining enable pipelining.
/// \param ipu_batches_per_step the number of batches per run in pipelining. /// \param ipu_batches_per_step the number of batches per run in pipelining.
/// ///
void EnableIpu(int ipu_device_num = 1, int ipu_micro_batch_size = 1, void EnableIpu(int ipu_device_num = 1,
int ipu_micro_batch_size = 1,
bool ipu_enable_pipelining = false, bool ipu_enable_pipelining = false,
int ipu_batches_per_step = 1); int ipu_batches_per_step = 1);
...@@ -315,7 +305,8 @@ struct PD_INFER_DECL AnalysisConfig { ...@@ -315,7 +305,8 @@ struct PD_INFER_DECL AnalysisConfig {
/// \param ipu_enable_half_partial enable fp16 partial for matmul, only work /// \param ipu_enable_half_partial enable fp16 partial for matmul, only work
/// with fp16. /// with fp16.
/// ///
void SetIpuConfig(bool ipu_enable_fp16 = false, int ipu_replica_num = 1, void SetIpuConfig(bool ipu_enable_fp16 = false,
int ipu_replica_num = 1,
float ipu_available_memory_proportion = 1.0, float ipu_available_memory_proportion = 1.0,
bool ipu_enable_half_partial = false); bool ipu_enable_half_partial = false);
...@@ -525,7 +516,8 @@ struct PD_INFER_DECL AnalysisConfig { ...@@ -525,7 +516,8 @@ struct PD_INFER_DECL AnalysisConfig {
/// ///
/// ///
void EnableTensorRtEngine(int workspace_size = 1 << 20, void EnableTensorRtEngine(int workspace_size = 1 << 20,
int max_batch_size = 1, int min_subgraph_size = 3, int max_batch_size = 1,
int min_subgraph_size = 3,
Precision precision = Precision::kFloat32, Precision precision = Precision::kFloat32,
bool use_static = false, bool use_static = false,
bool use_calib_mode = true); bool use_calib_mode = true);
...@@ -821,8 +813,10 @@ struct PD_INFER_DECL AnalysisConfig { ...@@ -821,8 +813,10 @@ struct PD_INFER_DECL AnalysisConfig {
/// \param params_buffer The memory buffer of the combined parameters file. /// \param params_buffer The memory buffer of the combined parameters file.
/// \param params_buffer_size The size of the combined parameters data. /// \param params_buffer_size The size of the combined parameters data.
/// ///
void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size, void SetModelBuffer(const char* prog_buffer,
const char* params_buffer, size_t params_buffer_size); size_t prog_buffer_size,
const char* params_buffer,
size_t params_buffer_size);
/// ///
/// \brief A boolean state telling whether the model is set from the CPU /// \brief A boolean state telling whether the model is set from the CPU
/// memory. /// memory.
...@@ -929,20 +923,6 @@ struct PD_INFER_DECL AnalysisConfig { ...@@ -929,20 +923,6 @@ struct PD_INFER_DECL AnalysisConfig {
int gpu_device_id_{0}; int gpu_device_id_{0};
uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB. uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB.
bool thread_local_stream_{false}; bool thread_local_stream_{false};
bool use_gpu_fp16_{false};
std::unordered_set<std::string> gpu_fp16_disabled_op_types_{
"conv2d_fusion",
"conv2d",
"roll",
"strided_slice",
"depthwise_conv2d",
"unfold",
"generate_proposals_v2",
"nearest_interp_v2",
"bilinear_interp_v2"
"yolo_box",
"multiclass_nms3",
"matrix_nms"};
bool use_cudnn_{false}; bool use_cudnn_{false};
bool use_external_stream_{false}; bool use_external_stream_{false};
......
...@@ -194,40 +194,6 @@ void GpuPassStrategy::EnableCUDNN() { ...@@ -194,40 +194,6 @@ void GpuPassStrategy::EnableCUDNN() {
use_cudnn_ = true; use_cudnn_ = true;
} }
void GpuPassStrategy::Exp_EnableUseGpuFp16() {
passes_.assign({
"is_test_pass", //
"simplify_with_basic_ops_pass", //
"conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", //
"embedding_eltwise_layernorm_fuse_pass", //
"multihead_matmul_fuse_pass_v2", //
"gpu_cpu_squeeze2_matmul_fuse_pass", //
"gpu_cpu_reshape2_matmul_fuse_pass", //
"gpu_cpu_flatten2_matmul_fuse_pass", //
"gpu_cpu_map_matmul_v2_to_mul_pass", //
"gpu_cpu_map_matmul_v2_to_matmul_pass", //
"gpu_cpu_map_matmul_to_mul_pass", //
// "fc_fuse_pass", //
"fc_elementwise_layernorm_fuse_pass", //
#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be
// guaranteed at least v7
// cudnn8.0 has memory leak problem in conv + eltwise + act, so we
// disable the pass.
#if !(CUDNN_VERSION >= 8000 && CUDNN_VERSION < 8100)
"conv_elementwise_add_act_fuse_pass", //
"conv_elementwise_add2_act_fuse_pass", //
#endif
"conv_elementwise_add_fuse_pass", //
#endif //
"transpose_flatten_concat_fuse_pass", //
"mixed_precision_configure_pass", //
"runtime_context_cache_pass" //
});
use_gpu_fp16_ = true;
}
void GpuPassStrategy::EnableMKLDNN() { void GpuPassStrategy::EnableMKLDNN() {
LOG(ERROR) << "GPU not support MKLDNN yet"; LOG(ERROR) << "GPU not support MKLDNN yet";
} }
......
...@@ -109,8 +109,11 @@ class PD_INFER_DECL PaddlePassBuilder { ...@@ -109,8 +109,11 @@ class PD_INFER_DECL PaddlePassBuilder {
protected: protected:
/// \cond Protected /// \cond Protected
std::vector<std::string> analysis_passes_{ std::vector<std::string> analysis_passes_{
{"ir_graph_build_pass", "ir_graph_clean_pass", "ir_analysis_pass", {"ir_graph_build_pass",
"ir_params_sync_among_devices_pass", "adjust_cudnn_workspace_size_pass", "ir_graph_clean_pass",
"ir_analysis_pass",
"ir_params_sync_among_devices_pass",
"adjust_cudnn_workspace_size_pass",
"inference_op_replace_pass"}}; "inference_op_replace_pass"}};
std::vector<std::string> passes_; std::vector<std::string> passes_;
/// \endcond /// \endcond
...@@ -129,9 +132,6 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { ...@@ -129,9 +132,6 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
/// \brief Enable the use of cuDNN kernel. /// \brief Enable the use of cuDNN kernel.
virtual void EnableCUDNN() {} virtual void EnableCUDNN() {}
/// \brief Enable use gpu fp16 kernel.
virtual void Exp_EnableUseGpuFp16() {}
/// \brief Enable the use of MKLDNN. /// \brief Enable the use of MKLDNN.
/// The MKLDNN control exists in both CPU and GPU mode, because there can /// The MKLDNN control exists in both CPU and GPU mode, because there can
/// still be some CPU kernels running in GPU mode. /// still be some CPU kernels running in GPU mode.
...@@ -150,10 +150,6 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { ...@@ -150,10 +150,6 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
/// \return A bool variable implying whether we are in gpu mode. /// \return A bool variable implying whether we are in gpu mode.
bool use_gpu() const { return use_gpu_; } bool use_gpu() const { return use_gpu_; }
/// \brief Check if we are using gpu fp16 kernel.
/// \return A bool variable implying whether we are in gpu fp16 mode.
bool use_gpu_fp16() const { return use_gpu_fp16_; }
/// \brief Check if we are using xpu. /// \brief Check if we are using xpu.
/// \return A bool variable implying whether we are in xpu mode. /// \return A bool variable implying whether we are in xpu mode.
bool use_xpu() const { return use_xpu_; } bool use_xpu() const { return use_xpu_; }
...@@ -180,7 +176,6 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { ...@@ -180,7 +176,6 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
bool use_npu_{false}; bool use_npu_{false};
bool use_ipu_{false}; bool use_ipu_{false};
bool use_mkldnn_{false}; bool use_mkldnn_{false};
bool use_gpu_fp16_{false};
bool use_custom_device_{false}; bool use_custom_device_{false};
/// \endcond /// \endcond
}; };
...@@ -248,9 +243,6 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy { ...@@ -248,9 +243,6 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy {
/// \brief Enable the use of cuDNN kernel. /// \brief Enable the use of cuDNN kernel.
void EnableCUDNN() override; void EnableCUDNN() override;
/// \brief Enable the use of gpu fp16 kernel.
void Exp_EnableUseGpuFp16() override;
/// \brief Not supported in GPU mode yet. /// \brief Not supported in GPU mode yet.
void EnableMKLDNN() override; void EnableMKLDNN() override;
...@@ -269,7 +261,6 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy { ...@@ -269,7 +261,6 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy {
protected: protected:
/// \cond Protected /// \cond Protected
bool use_cudnn_{false}; bool use_cudnn_{false};
bool use_gpu_fp16_{false};
/// \endcond /// \endcond
}; };
......
...@@ -113,7 +113,8 @@ template <typename T> ...@@ -113,7 +113,8 @@ template <typename T>
PaddleBuf PaddleBufCreate( PaddleBuf PaddleBufCreate(
py::array_t<T, py::array::c_style | py::array::forcecast> data) { py::array_t<T, py::array::c_style | py::array::forcecast> data) {
PaddleBuf buf(data.size() * sizeof(T)); PaddleBuf buf(data.size() * sizeof(T));
std::copy_n(static_cast<const T *>(data.data()), data.size(), std::copy_n(static_cast<const T *>(data.data()),
data.size(),
static_cast<T *>(buf.data())); static_cast<T *>(buf.data()));
return buf; return buf;
} }
...@@ -123,7 +124,8 @@ void PaddleBufReset( ...@@ -123,7 +124,8 @@ void PaddleBufReset(
PaddleBuf &buf, // NOLINT PaddleBuf &buf, // NOLINT
py::array_t<T, py::array::c_style | py::array::forcecast> data) { // NOLINT py::array_t<T, py::array::c_style | py::array::forcecast> data) { // NOLINT
buf.Resize(data.size() * sizeof(T)); buf.Resize(data.size() * sizeof(T));
std::copy_n(static_cast<const T *>(data.data()), data.size(), std::copy_n(static_cast<const T *>(data.data()),
data.size(),
static_cast<T *>(buf.data())); static_cast<T *>(buf.data()));
} }
...@@ -131,12 +133,14 @@ template <typename T> ...@@ -131,12 +133,14 @@ template <typename T>
PaddleTensor PaddleTensorCreate( PaddleTensor PaddleTensorCreate(
py::array_t<T, py::array::c_style | py::array::forcecast> data, py::array_t<T, py::array::c_style | py::array::forcecast> data,
const std::string name = "", const std::string name = "",
const std::vector<std::vector<size_t>> &lod = {}, bool copy = true) { const std::vector<std::vector<size_t>> &lod = {},
bool copy = true) {
PaddleTensor tensor; PaddleTensor tensor;
if (copy) { if (copy) {
PaddleBuf buf(data.size() * sizeof(T)); PaddleBuf buf(data.size() * sizeof(T));
std::copy_n(static_cast<const T *>(data.data()), data.size(), std::copy_n(static_cast<const T *>(data.data()),
data.size(),
static_cast<T *>(buf.data())); static_cast<T *>(buf.data()));
tensor.data = std::move(buf); tensor.data = std::move(buf);
} else { } else {
...@@ -235,11 +239,13 @@ void PaddleInferShareExternalData(paddle_infer::Tensor &tensor, // NOLINT ...@@ -235,11 +239,13 @@ void PaddleInferShareExternalData(paddle_infer::Tensor &tensor, // NOLINT
} }
if (input_tensor.dtype() == phi::DataType::FLOAT32) { if (input_tensor.dtype() == phi::DataType::FLOAT32) {
tensor.ShareExternalData( tensor.ShareExternalData(
static_cast<float *>(input_tensor.data()), shape, static_cast<float *>(input_tensor.data()),
shape,
ToPaddleInferPlace(input_tensor.place().GetType())); ToPaddleInferPlace(input_tensor.place().GetType()));
} else if (input_tensor.dtype() == phi::DataType::FLOAT16) { } else if (input_tensor.dtype() == phi::DataType::FLOAT16) {
tensor.ShareExternalData( tensor.ShareExternalData(
static_cast<paddle::platform::float16 *>(input_tensor.data()), shape, static_cast<paddle::platform::float16 *>(input_tensor.data()),
shape,
ToPaddleInferPlace(input_tensor.place().GetType())); ToPaddleInferPlace(input_tensor.place().GetType()));
} }
} }
...@@ -379,9 +385,11 @@ void BindInferenceApi(py::module *m) { ...@@ -379,9 +385,11 @@ void BindInferenceApi(py::module *m) {
BindMkldnnQuantizerConfig(m); BindMkldnnQuantizerConfig(m);
#endif #endif
m->def("create_paddle_predictor", m->def("create_paddle_predictor",
&paddle::CreatePaddlePredictor<AnalysisConfig>, py::arg("config")); &paddle::CreatePaddlePredictor<AnalysisConfig>,
py::arg("config"));
m->def("create_paddle_predictor", m->def("create_paddle_predictor",
&paddle::CreatePaddlePredictor<NativeConfig>, py::arg("config")); &paddle::CreatePaddlePredictor<NativeConfig>,
py::arg("config"));
m->def("create_predictor", m->def("create_predictor",
[](const paddle_infer::Config &config) [](const paddle_infer::Config &config)
-> std::unique_ptr<paddle_infer::Predictor> { -> std::unique_ptr<paddle_infer::Predictor> {
...@@ -478,15 +486,18 @@ void BindPaddleBuf(py::module *m) { ...@@ -478,15 +486,18 @@ void BindPaddleBuf(py::module *m) {
void BindPaddleTensor(py::module *m) { void BindPaddleTensor(py::module *m) {
py::class_<PaddleTensor>(*m, "PaddleTensor") py::class_<PaddleTensor>(*m, "PaddleTensor")
.def(py::init<>()) .def(py::init<>())
.def(py::init(&PaddleTensorCreate<int32_t>), py::arg("data"), .def(py::init(&PaddleTensorCreate<int32_t>),
py::arg("data"),
py::arg("name") = "", py::arg("name") = "",
py::arg("lod") = std::vector<std::vector<size_t>>(), py::arg("lod") = std::vector<std::vector<size_t>>(),
py::arg("copy") = true) py::arg("copy") = true)
.def(py::init(&PaddleTensorCreate<int64_t>), py::arg("data"), .def(py::init(&PaddleTensorCreate<int64_t>),
py::arg("data"),
py::arg("name") = "", py::arg("name") = "",
py::arg("lod") = std::vector<std::vector<size_t>>(), py::arg("lod") = std::vector<std::vector<size_t>>(),
py::arg("copy") = true) py::arg("copy") = true)
.def(py::init(&PaddleTensorCreate<float>), py::arg("data"), .def(py::init(&PaddleTensorCreate<float>),
py::arg("data"),
py::arg("name") = "", py::arg("name") = "",
py::arg("lod") = std::vector<std::vector<size_t>>(), py::arg("lod") = std::vector<std::vector<size_t>>(),
py::arg("copy") = true) py::arg("copy") = true)
...@@ -563,7 +574,8 @@ void BindNativePredictor(py::module *m) { ...@@ -563,7 +574,8 @@ void BindNativePredictor(py::module *m) {
.def("get_output_tensor", &NativePaddlePredictor::GetOutputTensor) .def("get_output_tensor", &NativePaddlePredictor::GetOutputTensor)
.def("zero_copy_run", &NativePaddlePredictor::ZeroCopyRun) .def("zero_copy_run", &NativePaddlePredictor::ZeroCopyRun)
.def("clone", &NativePaddlePredictor::Clone) .def("clone", &NativePaddlePredictor::Clone)
.def("scope", &NativePaddlePredictor::scope, .def("scope",
&NativePaddlePredictor::scope,
py::return_value_policy::reference); py::return_value_policy::reference);
} }
...@@ -581,8 +593,9 @@ void BindAnalysisConfig(py::module *m) { ...@@ -581,8 +593,9 @@ void BindAnalysisConfig(py::module *m) {
.def(py::init<const std::string &>()) .def(py::init<const std::string &>())
.def(py::init<const std::string &, const std::string &>()) .def(py::init<const std::string &, const std::string &>())
.def("summary", &AnalysisConfig::Summary) .def("summary", &AnalysisConfig::Summary)
.def("set_model", (void(AnalysisConfig::*)(const std::string &)) & .def("set_model",
AnalysisConfig::SetModel) (void(AnalysisConfig::*)(const std::string &)) &
AnalysisConfig::SetModel)
.def("set_model", .def("set_model",
(void(AnalysisConfig::*)(const std::string &, const std::string &)) & (void(AnalysisConfig::*)(const std::string &, const std::string &)) &
AnalysisConfig::SetModel) AnalysisConfig::SetModel)
...@@ -591,25 +604,32 @@ void BindAnalysisConfig(py::module *m) { ...@@ -591,25 +604,32 @@ void BindAnalysisConfig(py::module *m) {
.def("model_dir", &AnalysisConfig::model_dir) .def("model_dir", &AnalysisConfig::model_dir)
.def("prog_file", &AnalysisConfig::prog_file) .def("prog_file", &AnalysisConfig::prog_file)
.def("params_file", &AnalysisConfig::params_file) .def("params_file", &AnalysisConfig::params_file)
.def("enable_use_gpu", &AnalysisConfig::EnableUseGpu, .def("enable_use_gpu",
py::arg("memory_pool_init_size_mb"), py::arg("device_id") = 0) &AnalysisConfig::EnableUseGpu,
.def("exp_enable_use_gpu_fp16", &AnalysisConfig::Exp_EnableUseGpuFp16, py::arg("memory_pool_init_size_mb"),
py::arg("gpu_fp16_disabled_op_types") = py::arg("device_id") = 0)
std::unordered_set<std::string>({})) .def("enable_xpu",
.def("enable_xpu", &AnalysisConfig::EnableXpu, &AnalysisConfig::EnableXpu,
py::arg("l3_workspace_size") = 16 * 1024 * 1024, py::arg("l3_workspace_size") = 16 * 1024 * 1024,
py::arg("locked") = false, py::arg("autotune") = true, py::arg("locked") = false,
py::arg("autotune_file") = "", py::arg("precision") = "int16", py::arg("autotune") = true,
py::arg("autotune_file") = "",
py::arg("precision") = "int16",
py::arg("adaptive_seqlen") = false) py::arg("adaptive_seqlen") = false)
.def("set_xpu_device_id", &AnalysisConfig::SetXpuDeviceId, .def("set_xpu_device_id",
&AnalysisConfig::SetXpuDeviceId,
py::arg("device_id") = 0) py::arg("device_id") = 0)
.def("enable_npu", &AnalysisConfig::EnableNpu, py::arg("device_id") = 0) .def("enable_npu", &AnalysisConfig::EnableNpu, py::arg("device_id") = 0)
.def("enable_ipu", &AnalysisConfig::EnableIpu, .def("enable_ipu",
py::arg("ipu_device_num") = 1, py::arg("ipu_micro_batch_size") = 1, &AnalysisConfig::EnableIpu,
py::arg("ipu_device_num") = 1,
py::arg("ipu_micro_batch_size") = 1,
py::arg("ipu_enable_pipelining") = false, py::arg("ipu_enable_pipelining") = false,
py::arg("ipu_batches_per_step") = 1) py::arg("ipu_batches_per_step") = 1)
.def("set_ipu_config", &AnalysisConfig::SetIpuConfig, .def("set_ipu_config",
py::arg("ipu_enable_fp16") = false, py::arg("ipu_replica_num") = 1, &AnalysisConfig::SetIpuConfig,
py::arg("ipu_enable_fp16") = false,
py::arg("ipu_replica_num") = 1,
py::arg("ipu_available_memory_proportion") = 1.0, py::arg("ipu_available_memory_proportion") = 1.0,
py::arg("ipu_enable_half_partial") = false) py::arg("ipu_enable_half_partial") = false)
.def("disable_gpu", &AnalysisConfig::DisableGpu) .def("disable_gpu", &AnalysisConfig::DisableGpu)
...@@ -627,27 +647,34 @@ void BindAnalysisConfig(py::module *m) { ...@@ -627,27 +647,34 @@ void BindAnalysisConfig(py::module *m) {
&AnalysisConfig::memory_pool_init_size_mb) &AnalysisConfig::memory_pool_init_size_mb)
.def("fraction_of_gpu_memory_for_pool", .def("fraction_of_gpu_memory_for_pool",
&AnalysisConfig::fraction_of_gpu_memory_for_pool) &AnalysisConfig::fraction_of_gpu_memory_for_pool)
.def("switch_ir_optim", &AnalysisConfig::SwitchIrOptim, .def("switch_ir_optim",
&AnalysisConfig::SwitchIrOptim,
py::arg("x") = true) py::arg("x") = true)
.def("ir_optim", &AnalysisConfig::ir_optim) .def("ir_optim", &AnalysisConfig::ir_optim)
.def("enable_memory_optim", &AnalysisConfig::EnableMemoryOptim, .def("enable_memory_optim",
&AnalysisConfig::EnableMemoryOptim,
py::arg("x") = true) py::arg("x") = true)
.def("enable_profile", &AnalysisConfig::EnableProfile) .def("enable_profile", &AnalysisConfig::EnableProfile)
.def("disable_glog_info", &AnalysisConfig::DisableGlogInfo) .def("disable_glog_info", &AnalysisConfig::DisableGlogInfo)
.def("glog_info_disabled", &AnalysisConfig::glog_info_disabled) .def("glog_info_disabled", &AnalysisConfig::glog_info_disabled)
.def("set_optim_cache_dir", &AnalysisConfig::SetOptimCacheDir) .def("set_optim_cache_dir", &AnalysisConfig::SetOptimCacheDir)
.def("switch_use_feed_fetch_ops", &AnalysisConfig::SwitchUseFeedFetchOps, .def("switch_use_feed_fetch_ops",
&AnalysisConfig::SwitchUseFeedFetchOps,
py::arg("x") = true) py::arg("x") = true)
.def("use_feed_fetch_ops_enabled", .def("use_feed_fetch_ops_enabled",
&AnalysisConfig::use_feed_fetch_ops_enabled) &AnalysisConfig::use_feed_fetch_ops_enabled)
.def("switch_specify_input_names", .def("switch_specify_input_names",
&AnalysisConfig::SwitchSpecifyInputNames, py::arg("x") = true) &AnalysisConfig::SwitchSpecifyInputNames,
py::arg("x") = true)
.def("specify_input_name", &AnalysisConfig::specify_input_name) .def("specify_input_name", &AnalysisConfig::specify_input_name)
.def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine, .def("enable_tensorrt_engine",
py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1, &AnalysisConfig::EnableTensorRtEngine,
py::arg("workspace_size") = 1 << 20,
py::arg("max_batch_size") = 1,
py::arg("min_subgraph_size") = 3, py::arg("min_subgraph_size") = 3,
py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32, py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
py::arg("use_static") = false, py::arg("use_calib_mode") = true) py::arg("use_static") = false,
py::arg("use_calib_mode") = true)
.def("tensorrt_precision_mode", &AnalysisConfig::tensorrt_precision_mode) .def("tensorrt_precision_mode", &AnalysisConfig::tensorrt_precision_mode)
.def("set_trt_dynamic_shape_info", .def("set_trt_dynamic_shape_info",
&AnalysisConfig::SetTRTDynamicShapeInfo, &AnalysisConfig::SetTRTDynamicShapeInfo,
...@@ -674,7 +701,8 @@ void BindAnalysisConfig(py::module *m) { ...@@ -674,7 +701,8 @@ void BindAnalysisConfig(py::module *m) {
.def("trt_allow_build_at_runtime", .def("trt_allow_build_at_runtime",
&AnalysisConfig::trt_allow_build_at_runtime) &AnalysisConfig::trt_allow_build_at_runtime)
.def("exp_disable_tensorrt_ops", &AnalysisConfig::Exp_DisableTensorRtOPs) .def("exp_disable_tensorrt_ops", &AnalysisConfig::Exp_DisableTensorRtOPs)
.def("enable_tensorrt_dla", &AnalysisConfig::EnableTensorRtDLA, .def("enable_tensorrt_dla",
&AnalysisConfig::EnableTensorRtDLA,
py::arg("dla_core") = 0) py::arg("dla_core") = 0)
.def("tensorrt_dla_enabled", &AnalysisConfig::tensorrt_dla_enabled) .def("tensorrt_dla_enabled", &AnalysisConfig::tensorrt_dla_enabled)
.def("enable_tensorrt_inspector", .def("enable_tensorrt_inspector",
...@@ -682,15 +710,18 @@ void BindAnalysisConfig(py::module *m) { ...@@ -682,15 +710,18 @@ void BindAnalysisConfig(py::module *m) {
.def("tensorrt_inspector_enabled", .def("tensorrt_inspector_enabled",
&AnalysisConfig::tensorrt_inspector_enabled) &AnalysisConfig::tensorrt_inspector_enabled)
.def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled) .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
.def("enable_dlnne", &AnalysisConfig::EnableDlnne, .def("enable_dlnne",
&AnalysisConfig::EnableDlnne,
py::arg("min_subgraph_size") = 3) py::arg("min_subgraph_size") = 3)
.def("enable_lite_engine", &AnalysisConfig::EnableLiteEngine, .def("enable_lite_engine",
&AnalysisConfig::EnableLiteEngine,
py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32, py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
py::arg("zero_copy") = false, py::arg("zero_copy") = false,
py::arg("passes_filter") = std::vector<std::string>(), py::arg("passes_filter") = std::vector<std::string>(),
py::arg("ops_filter") = std::vector<std::string>()) py::arg("ops_filter") = std::vector<std::string>())
.def("lite_engine_enabled", &AnalysisConfig::lite_engine_enabled) .def("lite_engine_enabled", &AnalysisConfig::lite_engine_enabled)
.def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug, .def("switch_ir_debug",
&AnalysisConfig::SwitchIrDebug,
py::arg("x") = true) py::arg("x") = true)
.def("enable_mkldnn", &AnalysisConfig::EnableMKLDNN) .def("enable_mkldnn", &AnalysisConfig::EnableMKLDNN)
.def("mkldnn_enabled", &AnalysisConfig::mkldnn_enabled) .def("mkldnn_enabled", &AnalysisConfig::mkldnn_enabled)
...@@ -702,12 +733,15 @@ void BindAnalysisConfig(py::module *m) { ...@@ -702,12 +733,15 @@ void BindAnalysisConfig(py::module *m) {
.def("enable_quantizer", &AnalysisConfig::EnableMkldnnQuantizer) .def("enable_quantizer", &AnalysisConfig::EnableMkldnnQuantizer)
.def("enable_mkldnn_bfloat16", &AnalysisConfig::EnableMkldnnBfloat16) .def("enable_mkldnn_bfloat16", &AnalysisConfig::EnableMkldnnBfloat16)
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
.def("quantizer_config", &AnalysisConfig::mkldnn_quantizer_config, .def("quantizer_config",
&AnalysisConfig::mkldnn_quantizer_config,
py::return_value_policy::reference) py::return_value_policy::reference)
.def("set_mkldnn_cache_capacity", &AnalysisConfig::SetMkldnnCacheCapacity, .def("set_mkldnn_cache_capacity",
&AnalysisConfig::SetMkldnnCacheCapacity,
py::arg("capacity") = 0) py::arg("capacity") = 0)
.def("set_bfloat16_op", &AnalysisConfig::SetBfloat16Op) .def("set_bfloat16_op", &AnalysisConfig::SetBfloat16Op)
.def("enable_mkldnn_int8", &AnalysisConfig::EnableMkldnnInt8, .def("enable_mkldnn_int8",
&AnalysisConfig::EnableMkldnnInt8,
py::arg("mkldnn_int8_enabled_op_types") = py::arg("mkldnn_int8_enabled_op_types") =
std::unordered_set<std::string>({})) std::unordered_set<std::string>({}))
.def("mkldnn_int8_enabled", &AnalysisConfig::mkldnn_int8_enabled) .def("mkldnn_int8_enabled", &AnalysisConfig::mkldnn_int8_enabled)
...@@ -807,17 +841,20 @@ void BindAnalysisPredictor(py::module *m) { ...@@ -807,17 +841,20 @@ void BindAnalysisPredictor(py::module *m) {
.def("prepare_argument", &AnalysisPredictor::PrepareArgument) .def("prepare_argument", &AnalysisPredictor::PrepareArgument)
.def("optimize_inference_program", .def("optimize_inference_program",
&AnalysisPredictor::OptimizeInferenceProgram) &AnalysisPredictor::OptimizeInferenceProgram)
.def("analysis_argument", &AnalysisPredictor::analysis_argument, .def("analysis_argument",
&AnalysisPredictor::analysis_argument,
py::return_value_policy::reference) py::return_value_policy::reference)
.def("clone", &AnalysisPredictor::Clone) .def("clone", &AnalysisPredictor::Clone)
.def("scope", &AnalysisPredictor::scope, .def("scope",
&AnalysisPredictor::scope,
py::return_value_policy::reference) py::return_value_policy::reference)
.def("program", &AnalysisPredictor::program, .def("program",
&AnalysisPredictor::program,
py::return_value_policy::reference) py::return_value_policy::reference)
.def("get_serialized_program", &AnalysisPredictor::GetSerializedProgram) .def("get_serialized_program", &AnalysisPredictor::GetSerializedProgram)
.def("mkldnn_quantize", &AnalysisPredictor::MkldnnQuantize) .def("mkldnn_quantize", &AnalysisPredictor::MkldnnQuantize)
.def("SaveOptimModel", &AnalysisPredictor::SaveOptimModel, .def(
py::arg("dir")); "SaveOptimModel", &AnalysisPredictor::SaveOptimModel, py::arg("dir"));
} }
void BindPaddleInferPredictor(py::module *m) { void BindPaddleInferPredictor(py::module *m) {
...@@ -842,10 +879,12 @@ void BindPaddleInferPredictor(py::module *m) { ...@@ -842,10 +879,12 @@ void BindPaddleInferPredictor(py::module *m) {
void BindZeroCopyTensor(py::module *m) { void BindZeroCopyTensor(py::module *m) {
py::class_<ZeroCopyTensor>(*m, "ZeroCopyTensor") py::class_<ZeroCopyTensor>(*m, "ZeroCopyTensor")
.def("reshape", py::overload_cast<const std::vector<int> &>( .def(
&ZeroCopyTensor::Reshape)) "reshape",
.def("reshape", py::overload_cast<const std::size_t &>( py::overload_cast<const std::vector<int> &>(&ZeroCopyTensor::Reshape))
&paddle_infer::Tensor::ReshapeStrings)) .def("reshape",
py::overload_cast<const std::size_t &>(
&paddle_infer::Tensor::ReshapeStrings))
.def("copy_from_cpu", &ZeroCopyTensorCreate<int32_t>) .def("copy_from_cpu", &ZeroCopyTensorCreate<int32_t>)
.def("copy_from_cpu", &ZeroCopyTensorCreate<int64_t>) .def("copy_from_cpu", &ZeroCopyTensorCreate<int64_t>)
.def("copy_from_cpu", &ZeroCopyTensorCreate<float>) .def("copy_from_cpu", &ZeroCopyTensorCreate<float>)
...@@ -860,10 +899,12 @@ void BindZeroCopyTensor(py::module *m) { ...@@ -860,10 +899,12 @@ void BindZeroCopyTensor(py::module *m) {
void BindPaddleInferTensor(py::module *m) { void BindPaddleInferTensor(py::module *m) {
py::class_<paddle_infer::Tensor>(*m, "PaddleInferTensor") py::class_<paddle_infer::Tensor>(*m, "PaddleInferTensor")
.def("reshape", py::overload_cast<const std::vector<int> &>( .def("reshape",
&paddle_infer::Tensor::Reshape)) py::overload_cast<const std::vector<int> &>(
.def("reshape", py::overload_cast<const std::size_t &>( &paddle_infer::Tensor::Reshape))
&paddle_infer::Tensor::ReshapeStrings)) .def("reshape",
py::overload_cast<const std::size_t &>(
&paddle_infer::Tensor::ReshapeStrings))
.def("copy_from_cpu_bind", &PaddleInferTensorCreate<int32_t>) .def("copy_from_cpu_bind", &PaddleInferTensorCreate<int32_t>)
.def("copy_from_cpu_bind", &PaddleInferTensorCreate<int64_t>) .def("copy_from_cpu_bind", &PaddleInferTensorCreate<int64_t>)
.def("copy_from_cpu_bind", &PaddleInferTensorCreate<float>) .def("copy_from_cpu_bind", &PaddleInferTensorCreate<float>)
...@@ -881,7 +922,8 @@ void BindPaddleInferTensor(py::module *m) { ...@@ -881,7 +922,8 @@ void BindPaddleInferTensor(py::module *m) {
void BindPredictorPool(py::module *m) { void BindPredictorPool(py::module *m) {
py::class_<paddle_infer::services::PredictorPool>(*m, "PredictorPool") py::class_<paddle_infer::services::PredictorPool>(*m, "PredictorPool")
.def(py::init<const paddle_infer::Config &, size_t>()) .def(py::init<const paddle_infer::Config &, size_t>())
.def("retrive", &paddle_infer::services::PredictorPool::Retrive, .def("retrive",
&paddle_infer::services::PredictorPool::Retrive,
py::return_value_policy::reference); py::return_value_policy::reference);
} }
...@@ -904,7 +946,8 @@ void BindPaddlePassBuilder(py::module *m) { ...@@ -904,7 +946,8 @@ void BindPaddlePassBuilder(py::module *m) {
.def("append_analysis_pass", &PaddlePassBuilder::AppendAnalysisPass) .def("append_analysis_pass", &PaddlePassBuilder::AppendAnalysisPass)
.def("turn_on_debug", &PaddlePassBuilder::TurnOnDebug) .def("turn_on_debug", &PaddlePassBuilder::TurnOnDebug)
.def("debug_string", &PaddlePassBuilder::DebugString) .def("debug_string", &PaddlePassBuilder::DebugString)
.def("all_passes", &PaddlePassBuilder::AllPasses, .def("all_passes",
&PaddlePassBuilder::AllPasses,
py::return_value_policy::reference) py::return_value_policy::reference)
.def("analysis_passes", &PaddlePassBuilder::AnalysisPasses); .def("analysis_passes", &PaddlePassBuilder::AnalysisPasses);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册