未验证 提交 d353b126 编写于 作者: Y ysh329 提交者: GitHub

[PROFILE] Add ENV var controls whether write output tensor of each op to...

[PROFILE] Add ENV var controls whether write output tensor of each op to files; Rename output tensor name when mem_reuse pass enabled by default etc. (#4348)

*  Add an ENV var that controls whether each op's output tensor is written to files;
*  Rename output tensors when the mem_reuse pass is enabled by default, etc.
上级 ebc0e39c
......@@ -80,99 +80,98 @@ class Optimizer {
InitControlFlowOpUnusedInputsAndOutputsEliminatePass();
if (passes.empty() || passes.size() == 1) {
std::vector<std::string> passes_local{
{"lite_quant_dequant_fuse_pass", //
"weight_quantization_preprocess_pass", //
"lite_conv_elementwise_fuse_pass", // conv-elemwise-bn
"lite_conv_bn_fuse_pass", //
"lite_conv_elementwise_fuse_pass", // conv-bn-elemwise
"lite_conv_conv_fuse_pass", //
// TODO(Superjomn) Refine the fusion related design to select fusion
// kernels for devices automatically.
"lite_conv_activation_fuse_pass", //
"lite_var_conv_2d_activation_fuse_pass", //
"lite_match_matrix_activation_fuse_pass", //
"lite_fc_fuse_pass", //
"lite_shuffle_channel_fuse_pass", //
"lite_transpose_softmax_transpose_fuse_pass", //
"lite_interpolate_fuse_pass", //
"identity_scale_eliminate_pass", //
"lite_scales_fuse_pass", //
"lite_sequence_reverse_embedding_fuse_pass", //
"elementwise_mul_constant_eliminate_pass", //
"lite_sequence_pool_concat_fuse_pass", //
"lite_scale_activation_fuse_pass", //
std::vector<std::string> passes_local{{
"lite_quant_dequant_fuse_pass", //
"weight_quantization_preprocess_pass", //
"lite_conv_elementwise_fuse_pass", // conv-elemwise-bn
"lite_conv_bn_fuse_pass", //
"lite_conv_elementwise_fuse_pass", // conv-bn-elemwise
"lite_conv_conv_fuse_pass", //
// TODO(Superjomn) Refine the fusion related design to select fusion
// kernels for devices automatically.
"lite_conv_activation_fuse_pass", //
"lite_var_conv_2d_activation_fuse_pass", //
"lite_match_matrix_activation_fuse_pass", //
"lite_fc_fuse_pass", //
"lite_shuffle_channel_fuse_pass", //
"lite_transpose_softmax_transpose_fuse_pass", //
"lite_interpolate_fuse_pass", //
"identity_scale_eliminate_pass", //
"lite_scales_fuse_pass", //
"lite_sequence_reverse_embedding_fuse_pass", //
"elementwise_mul_constant_eliminate_pass", //
"lite_sequence_pool_concat_fuse_pass", //
"lite_scale_activation_fuse_pass", //
#if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) || \
(defined LITE_WITH_ARM)
"lite_elementwise_activation_fuse_pass", //
"lite_elementwise_activation_fuse_pass", //
#endif
"identity_dropout_eliminate_pass",
"__xpu__resnet_fuse_pass",
"__xpu__resnet_cbam_fuse_pass",
"__xpu__conv2d_fuse_pass",
"__xpu__conv2d_link_previous_out_max_pass",
"__xpu__sfa_head_meanstd_fuse_pass",
"__xpu__sfa_head_moment_fuse_pass",
"__xpu__mmdnn_fuse_pass",
"__xpu__multi_encoder_fuse_pass",
"__xpu__embedding_with_eltwise_add_fuse_pass",
"__xpu__fc_fuse_pass",
"quantized_op_attributes_inference_pass", // Only for fully
// quantized model, infer
// the output scale and
// fix the attribute
// 'enable_int8' for all
// of the quantized ops.
"npu_subgraph_pass",
"huawei_ascend_npu_subgraph_pass",
"xpu_subgraph_pass",
"bm_subgraph_pass",
"apu_subgraph_pass",
"rknpu_subgraph_pass",
"mlu_subgraph_pass",
"control_flow_op_unused_inputs_and_outputs_eliminate_pass",
"static_kernel_pick_pass", // pick original kernel from graph
"remove_tf_redundant_ops_pass",
"variable_place_inference_pass", // inference arg/var's
"mlu_postprocess_pass",
// info(target/precision/layout/device)
// using kernel info
"argument_type_display_pass", // debug pass: show arg-type-node's
// info
// (target/precision/layout/device)
"type_target_cast_pass", // add io_copy/io_copy_once if meet
// different targets when last and next
// node
"variable_place_inference_pass", //
"argument_type_display_pass", //
"io_copy_kernel_pick_pass", //
"argument_type_display_pass", //
"variable_place_inference_pass", //
"argument_type_display_pass", //
"type_precision_cast_pass", //
"variable_place_inference_pass", //
"argument_type_display_pass", //
"type_layout_cast_pass", // add layout/layout_once op if meet
// different layout when last and next node
"argument_type_display_pass", //
"variable_place_inference_pass", //
"argument_type_display_pass",
"runtime_context_assign_pass",
"argument_type_display_pass",
"lite_reshape_fuse_pass",
#ifndef LITE_WITH_PRECISION_PROFILE
"memory_optimize_pass"
#endif
}};
"identity_dropout_eliminate_pass",
"__xpu__resnet_fuse_pass",
"__xpu__resnet_cbam_fuse_pass",
"__xpu__conv2d_fuse_pass",
"__xpu__conv2d_link_previous_out_max_pass",
"__xpu__sfa_head_meanstd_fuse_pass",
"__xpu__sfa_head_moment_fuse_pass",
"__xpu__mmdnn_fuse_pass",
"__xpu__multi_encoder_fuse_pass",
"__xpu__embedding_with_eltwise_add_fuse_pass",
"__xpu__fc_fuse_pass",
"quantized_op_attributes_inference_pass", // Only for fully
// quantized model, infer
// the output scale and
// fix the attribute
// 'enable_int8' for all
// of the quantized ops.
"npu_subgraph_pass",
"huawei_ascend_npu_subgraph_pass",
"xpu_subgraph_pass",
"bm_subgraph_pass",
"apu_subgraph_pass",
"rknpu_subgraph_pass",
"mlu_subgraph_pass",
"control_flow_op_unused_inputs_and_outputs_eliminate_pass",
"static_kernel_pick_pass", // pick original kernel from graph
"remove_tf_redundant_ops_pass",
"variable_place_inference_pass", // inference arg/var's
"mlu_postprocess_pass",
// info(target/precision/layout/device)
// using kernel info
"argument_type_display_pass", // debug pass: show arg-type-node's
// info
// (target/precision/layout/device)
"type_target_cast_pass", // add io_copy/io_copy_once if meet
// different targets when last and next
// node
"variable_place_inference_pass", //
"argument_type_display_pass", //
"io_copy_kernel_pick_pass", //
"argument_type_display_pass", //
"variable_place_inference_pass", //
"argument_type_display_pass", //
"type_precision_cast_pass", //
"variable_place_inference_pass", //
"argument_type_display_pass", //
"type_layout_cast_pass", // add layout/layout_once op if meet
// different layout when last and next node
"argument_type_display_pass", //
"variable_place_inference_pass", //
"argument_type_display_pass",
"runtime_context_assign_pass",
"argument_type_display_pass",
"lite_reshape_fuse_pass",
"memory_optimize_pass" // you can comment this line when enable
// PRECISION_PROFILE
}};
if (passes.size() == 1) {
// multi_stream_analysis_pass must be in the front of
......
......@@ -23,6 +23,8 @@
#include <time.h>
#include <cmath>
#include <cstdlib>
#include <map>
#include <memory>
#include <string>
#include <vector>
......@@ -131,7 +133,14 @@ class PrecisionProfiler {
std::string inst_precison_str = GetInstPrecision(inst);
}
PrecisionProfiler() { MkDirRecur(log_dir_); }
// Creates the profiler: ensures the log directory exists and reads the
// PADDLELITE_PRECISION_WRITE_TO_FILE environment variable to decide whether
// each op's output tensor should additionally be dumped to files.
PrecisionProfiler() {
  MkDirRecur(log_dir_);
  // Any positive integer value enables dumping; unset, zero, or a
  // non-numeric value leaves it disabled. The original ternary tested
  // `atoi(...) > 0` twice and returned the same expression in its true
  // branch, which collapses to this single boolean expression.
  const char* write_to_file_raw =
      std::getenv("PADDLELITE_PRECISION_WRITE_TO_FILE");
  write_result_to_file_ = write_to_file_raw && atoi(write_to_file_raw) > 0;
}
std::string GetSummaryHeader() {
using std::setw;
......@@ -158,6 +167,18 @@ class PrecisionProfiler {
return ss.str();
}
// Returns the explanatory notes appended after the precision summary table,
// as a newline-terminated multi-line string.
std::string GetSummaryTail() {
  std::string tail{"[note]\n"};
  tail +=
      "1. `ave_grow_rate`: show the sequence value of tensor when std_dev "
      "& mean are same.\n";
  tail +=
      "2. Enable write each output tensor to file: `export "
      "PADDLELITE_PRECISION_WRITE_TO_FILE=1` on ADB command line.\n";
  return tail;
}
template <typename T>
double compute_mean(const T* in, const size_t length) {
double sum = 0.;
......@@ -203,6 +224,17 @@ class PrecisionProfiler {
return false;
}
// Generates a unique display name for an op output tensor. When the
// memory-reuse pass is enabled, several ops may share one tensor name;
// appending a per-name occurrence counter ("name_1", "name_2", ...) keeps
// the profiler rows distinguishable.
std::string rename_out_for_mem_reuse_pass(const std::string& old_name) {
  // std::map::operator[] value-initializes the counter to 0 for a
  // first-seen name, so a single pre-increment covers both the "new name"
  // and "repeated name" cases handled separately in the original.
  return old_name + "_" + std::to_string(++out_tensor_names_map[old_name]);
}
void compute_tensor_precision_info(const Tensor* in,
TargetType target_type,
PrecisionType precision_type,
......@@ -432,13 +464,12 @@ class PrecisionProfiler {
using std::left;
using std::fixed;
STL::stringstream ss;
bool write_result_to_file = true;
VLOG(1) << ">> Running kernel: " << inst->op()->op_info()->Repr()
<< " registered on " << TargetToStr(inst->kernel()->target()) << "/"
<< PrecisionToStr(inst->kernel()->precision()) << "/"
<< DataLayoutToStr(inst->kernel()->layout())
<< ", write_result_to_file:" << write_result_to_file;
<< ", write_result_to_file_:" << write_result_to_file_;
std::string kernel_repr = inst->op()->op_info()->Repr();
std::string kernel_place = TargetToStr(inst->kernel()->target()) + "/" +
......@@ -465,6 +496,7 @@ class PrecisionProfiler {
std::string mean_str{"unused"};
std::string std_dev_str{"unused"};
std::string ave_grow_rate_str{"unused"};
std::string new_out_name = rename_out_for_mem_reuse_pass(out_name);
if (!is_unused(tout)) {
compute_tensor_precision_info(tout,
......@@ -474,14 +506,14 @@ class PrecisionProfiler {
&mean,
&std_dev,
&ave_grow_rate,
out_name,
write_result_to_file);
new_out_name,
write_result_to_file_);
mean_str = std::to_string(mean);
std_dev_str = std::to_string(std_dev);
ave_grow_rate_str = std::to_string(ave_grow_rate);
}
std::string kernel_info = op_name + ":" + kernel_place;
std::string output_arg_info = out_name + ":" +
std::string output_arg_info = new_out_name + ":" +
TargetToStr(type->target()) + "/" +
PrecisionToStr(type->precision()) +
"/" + DataLayoutToStr(type->layout());
......@@ -502,6 +534,7 @@ class PrecisionProfiler {
std::string mean_str{"unused"};
std::string std_dev_str{"unused"};
std::string ave_grow_rate_str{"unused"};
std::string new_out_name = rename_out_for_mem_reuse_pass(out_name);
if (!is_unused(tout)) {
compute_tensor_precision_info(tout,
......@@ -511,14 +544,14 @@ class PrecisionProfiler {
&mean,
&std_dev,
&ave_grow_rate,
out_name,
write_result_to_file);
new_out_name,
write_result_to_file_);
mean_str = std::to_string(mean);
std_dev_str = std::to_string(std_dev);
ave_grow_rate_str = std::to_string(ave_grow_rate);
}
std::string kernel_info = op_name + ":" + kernel_place;
std::string output_arg_info = out_name + ":" +
std::string output_arg_info = new_out_name + ":" +
TargetToStr(type->target()) + "/" +
PrecisionToStr(type->precision()) +
"/" + DataLayoutToStr(type->layout());
......@@ -540,6 +573,8 @@ class PrecisionProfiler {
std::string log_dir_{"/storage/emulated/0/PaddleLite_" + get_date_str() +
"/"};
std::string summary_log_dir_{log_dir_ + "precision_summary.log"};
std::map<std::string, size_t> out_tensor_names_map;
bool write_result_to_file_{false};
};
} // namespace profile
......
......@@ -302,7 +302,9 @@ void RuntimeProgram::Run() {
LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 1);
#endif
#ifdef LITE_WITH_PRECISION_PROFILE
LOG(INFO) << "\n" << precision_profiler_summary;
LOG(INFO) << "\n"
<< precision_profiler_summary
<< inst_precision_profiler.GetSummaryTail();
#endif
}
......
......@@ -29,6 +29,21 @@ int64_t ShapeProduction(const shape_t& shape) {
return res;
}
// Formats a list of input shapes, e.g. {{1,3,224,224},{1,5}} ->
// "1,3,224,224, : 1,5,". Dims inside one shape are comma-terminated and
// shapes are separated by " : "; no separator follows the last shape.
std::string ShapePrint(const std::vector<shape_t>& shapes) {
  std::string shapes_str{""};
  for (size_t shape_idx = 0; shape_idx < shapes.size(); ++shape_idx) {
    for (auto dim : shapes[shape_idx]) {
      shapes_str += std::to_string(dim) + ",";
    }
    // Append the separator only between shapes. The original condition
    // `(shape_idx != 0 && shape_idx == shapes.size() - 1)` failed for a
    // single-element list (idx 0 made the first conjunct false), so one
    // shape was printed with a spurious trailing " : ".
    if (shape_idx + 1 != shapes.size()) {
      shapes_str += " : ";
    }
  }
  return shapes_str;
}
std::string ShapePrint(const shape_t& shape) {
std::string shape_str{""};
for (auto i : shape) {
......@@ -37,6 +52,37 @@ std::string ShapePrint(const shape_t& shape) {
return shape_str;
}
// Splits `str_in` on `delim` (default ":", matching the original
// hard-coded behavior, so existing callers are unaffected).
// Consecutive delimiters yield empty fields; a trailing delimiter does
// not produce a final empty field; an empty input yields an empty vector.
std::vector<std::string> split_string(const std::string& str_in,
                                      const std::string& delim = ":") {
  std::vector<std::string> str_out;
  std::string tmp_str = str_in;
  while (!tmp_str.empty()) {
    size_t next_offset = tmp_str.find(delim);
    str_out.push_back(tmp_str.substr(0, next_offset));
    if (next_offset == std::string::npos) {
      break;
    }
    tmp_str = tmp_str.substr(next_offset + delim.size());
  }
  return str_out;
}
// Parses a comma-separated shape string such as "1,3,224,224" into a
// vector of int64_t dims. An empty input yields an empty vector.
std::vector<int64_t> get_shape(const std::string& str_shape) {
  std::vector<int64_t> shape;
  std::string tmp_str = str_shape;
  while (!tmp_str.empty()) {
    // atoll, not atoi: the dims are int64_t, and atoi would silently
    // truncate values larger than INT_MAX.
    int64_t dim = atoll(tmp_str.data());
    shape.push_back(dim);
    size_t next_offset = tmp_str.find(",");
    if (next_offset == std::string::npos) {
      break;
    }
    tmp_str = tmp_str.substr(next_offset + 1);
  }
  return shape;
}
template <typename T>
double compute_mean(const T* in, const size_t length) {
double sum = 0.;
......@@ -70,7 +116,7 @@ inline double GetCurrentUS() {
}
void RunModel(std::string model_dir,
const shape_t& input_shape,
const std::vector<shape_t>& input_shapes,
size_t repeats,
size_t warmup,
size_t print_output_elem,
......@@ -111,12 +157,19 @@ void RunModel(std::string model_dir,
CreatePaddlePredictor<MobileConfig>(config);
// 3. Prepare input data
std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
input_tensor->Resize(
{input_shape[0], input_shape[1], input_shape[2], input_shape[3]});
auto* data = input_tensor->mutable_data<float>();
for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
data[i] = 1;
std::cout << "input_shapes.size():" << input_shapes.size() << std::endl;
for (int j = 0; j < input_shapes.size(); ++j) {
auto input_tensor = predictor->GetInput(j);
input_tensor->Resize(input_shapes[j]);
auto input_data = input_tensor->mutable_data<float>();
int input_num = 1;
for (int i = 0; i < input_shapes[j].size(); ++i) {
input_num *= input_shapes[j][i];
}
for (int i = 0; i < input_num; ++i) {
input_data[i] = 1.f;
}
}
// 4. Run predictor
......@@ -142,7 +195,7 @@ void RunModel(std::string model_dir,
}
avg_duration = sum_duration / static_cast<float>(repeats);
std::cout << "\n======= benchmark summary =======\n"
<< "input_shape(NCHW):" << ShapePrint(input_shape) << "\n"
<< "input_shape(s) (NCHW):" << ShapePrint(input_shapes) << "\n"
<< "model_dir:" << model_dir << "\n"
<< "warmup:" << warmup << "\n"
<< "repeats:" << repeats << "\n"
......@@ -184,18 +237,19 @@ void RunModel(std::string model_dir,
}
int main(int argc, char** argv) {
shape_t input_shape{1, 3, 224, 224}; // shape_t ==> std::vector<int64_t>
std::vector<std::string> str_input_shapes;
std::vector<shape_t> input_shapes{
{1, 3, 224, 224}}; // shape_t ==> std::vector<int64_t>
int repeats = 10;
int warmup = 10;
int print_output_elem = 0;
if (argc > 2 && argc < 9) {
if (argc > 2 && argc < 6) {
std::cerr << "usage: ./" << argv[0] << "\n"
<< " <naive_buffer_model_dir>\n"
<< " <input_n>\n"
<< " <input_c>\n"
<< " <input_h>\n"
<< " <input_w>\n"
<< " <raw_input_shapes>, eg: 1,3,224,224 for 1 input; "
"1,3,224,224:1,5 for 2 inputs\n"
<< " <repeats>\n"
<< " <warmup>\n"
<< " <print_output>" << std::endl;
......@@ -203,14 +257,19 @@ int main(int argc, char** argv) {
}
std::string model_dir = argv[1];
if (argc >= 9) {
input_shape[0] = atoi(argv[2]);
input_shape[1] = atoi(argv[3]);
input_shape[2] = atoi(argv[4]);
input_shape[3] = atoi(argv[5]);
repeats = atoi(argv[6]);
warmup = atoi(argv[7]);
print_output_elem = atoi(argv[8]);
if (argc >= 6) {
input_shapes.clear();
std::string raw_input_shapes = argv[2];
std::cout << "raw_input_shapes: " << raw_input_shapes << std::endl;
str_input_shapes = split_string(raw_input_shapes);
for (size_t i = 0; i < str_input_shapes.size(); ++i) {
std::cout << "input shape: " << str_input_shapes[i] << std::endl;
input_shapes.push_back(get_shape(str_input_shapes[i]));
}
repeats = atoi(argv[3]);
warmup = atoi(argv[4]);
print_output_elem = atoi(argv[5]);
}
// set arm power mode:
// 0 for big cluster, high performance
......@@ -220,7 +279,7 @@ int main(int argc, char** argv) {
size_t power_mode = 0;
RunModel(
model_dir, input_shape, repeats, warmup, print_output_elem, power_mode);
model_dir, input_shapes, repeats, warmup, print_output_elem, power_mode);
return 0;
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册