From 9bea65cbfa777e20bca187f965cca1e5330d281b Mon Sep 17 00:00:00 2001 From: ysh329 Date: Thu, 24 Sep 2020 07:15:00 -0500 Subject: [PATCH] [cherry-pick][BugFix][OPENCL] BugFix for OpenCL: image memory malloc; dropout kernel register; precision profiler enhance; layout pass bugfix for opencl (#4426) --- lite/core/memory.h | 18 +- lite/core/memory_test.cc | 6 + lite/core/mir/type_layout_cast_pass.cc | 5 +- lite/core/optimizer.h | 177 +++++++++--------- lite/core/profile/precision_profiler.h | 170 +++++++++++++---- lite/core/program.cc | 4 +- .../cxx/mobile_light/mobilenetv1_light_api.cc | 105 ++++++++--- lite/utils/logging.cc | 1 - 8 files changed, 328 insertions(+), 158 deletions(-) diff --git a/lite/core/memory.h b/lite/core/memory.h index c80c8fb6b6..872cfd120c 100644 --- a/lite/core/memory.h +++ b/lite/core/memory.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once +#include #include #include "lite/api/paddle_place.h" #include "lite/core/target_wrapper.h" @@ -140,20 +141,21 @@ class Buffer { #ifdef LITE_WITH_OPENCL template void ResetLazyImage2D(TargetType target, - const size_t img_w, - const size_t img_h, + const size_t img_w_req, + const size_t img_h_req, void* host_ptr = nullptr) { - if (target != target_ || cl_image2d_width_ < img_w || - cl_image2d_height_ < img_h || host_ptr != nullptr) { + if (target != target_ || cl_image2d_width_ < img_w_req || + cl_image2d_height_ < img_h_req || host_ptr != nullptr) { CHECK_EQ(own_data_, true) << "Can not reset unowned buffer."; + cl_image2d_width_ = std::max(cl_image2d_width_, img_w_req); + cl_image2d_height_ = std::max(cl_image2d_height_, img_h_req); Free(); - data_ = TargetWrapperCL::MallocImage(img_w, img_h, host_ptr); + data_ = TargetWrapperCL::MallocImage( + cl_image2d_width_, cl_image2d_height_, host_ptr); target_ = target; - space_ = sizeof(T) * img_w * img_h * + space_ = sizeof(T) * cl_image2d_width_ * cl_image2d_height_ * 4; // un-used for opencl Image2D, 4 for RGBA, cl_use_image2d_ = true; - cl_image2d_width_ = img_w; - cl_image2d_height_ = img_h; } } #endif diff --git a/lite/core/memory_test.cc b/lite/core/memory_test.cc index cd9062afca..6343854db2 100644 --- a/lite/core/memory_test.cc +++ b/lite/core/memory_test.cc @@ -28,6 +28,12 @@ TEST(memory, test) { ASSERT_TRUE(buf_cuda); TargetFree(TARGET(kCUDA), buf_cuda); #endif + +#ifdef LITE_WITH_OPENCL + auto* buf_cl = TargetMalloc(TARGET(kOpenCL), 10); + ASSERT_TRUE(buf_cl); + TargetFree(TARGET(kOpenCL), buf_cl); +#endif } } // namespace lite diff --git a/lite/core/mir/type_layout_cast_pass.cc b/lite/core/mir/type_layout_cast_pass.cc index 44b6eaf1eb..c1529aacf8 100644 --- a/lite/core/mir/type_layout_cast_pass.cc +++ b/lite/core/mir/type_layout_cast_pass.cc @@ -82,8 +82,11 @@ void TypeLayoutTransformPass::ComplementInputs(SSAGraph* graph, // not a good judge, but don't find the source of this issue from // static_pick_kernel_pass // to this pass. + auto is_host = [](TargetType x) -> bool { + return x == TARGET(kHost) || x == TARGET(kX86) || x == TARGET(kARM); + }; auto* in_arg_type = const_cast(in->AsArg().type); - if (in_arg_type->target() == TARGET(kARM) && + if (is_host(in_arg_type->target()) && in_arg_type->layout() == DATALAYOUT(kImageDefault)) { return; } diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index 2dfc444a26..7b12b32b69 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -80,97 +80,98 @@ class Optimizer { InitControlFlowOpUnusedInputsAndOutputsEliminatePass(); if (passes.empty() || passes.size() == 1) { - std::vector passes_local{ - {"lite_quant_dequant_fuse_pass", // - "weight_quantization_preprocess_pass", // - "lite_conv_elementwise_fuse_pass", // conv-elemwise-bn - "lite_conv_bn_fuse_pass", // - "lite_conv_elementwise_fuse_pass", // conv-bn-elemwise - "lite_conv_conv_fuse_pass", // - // TODO(Superjomn) Refine the fusion related design to select fusion - // kernels for devices automatically. - "lite_conv_activation_fuse_pass", // - "lite_var_conv_2d_activation_fuse_pass", // - "lite_match_matrix_activation_fuse_pass", // - "lite_fc_fuse_pass", // - "lite_shuffle_channel_fuse_pass", // - "lite_transpose_softmax_transpose_fuse_pass", // - "lite_interpolate_fuse_pass", // - "identity_scale_eliminate_pass", // - "lite_scales_fuse_pass", // - "lite_sequence_reverse_embedding_fuse_pass", // - "elementwise_mul_constant_eliminate_pass", // - "lite_sequence_pool_concat_fuse_pass", // - "lite_scale_activation_fuse_pass", // + std::vector passes_local{{ + "lite_quant_dequant_fuse_pass", // + "weight_quantization_preprocess_pass", // + "lite_conv_elementwise_fuse_pass", // conv-elemwise-bn + "lite_conv_bn_fuse_pass", // + "lite_conv_elementwise_fuse_pass", // conv-bn-elemwise + "lite_conv_conv_fuse_pass", // + // TODO(Superjomn) Refine the fusion related design to select fusion + // kernels for devices automatically. + "lite_conv_activation_fuse_pass", // + "lite_var_conv_2d_activation_fuse_pass", // + "lite_match_matrix_activation_fuse_pass", // + "lite_fc_fuse_pass", // + "lite_shuffle_channel_fuse_pass", // + "lite_transpose_softmax_transpose_fuse_pass", // + "lite_interpolate_fuse_pass", // + "identity_scale_eliminate_pass", // + "lite_scales_fuse_pass", // + "lite_sequence_reverse_embedding_fuse_pass", // + "elementwise_mul_constant_eliminate_pass", // + "lite_sequence_pool_concat_fuse_pass", // + "lite_scale_activation_fuse_pass", // #if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) || \ (defined LITE_WITH_ARM) - "lite_elementwise_activation_fuse_pass", // + "lite_elementwise_activation_fuse_pass", // #endif - "identity_dropout_eliminate_pass", - "__xpu__resnet_fuse_pass", - "__xpu__resnet_cbam_fuse_pass", - "__xpu__conv2d_fuse_pass", - "__xpu__conv2d_link_previous_out_max_pass", - "__xpu__sfa_head_meanstd_fuse_pass", - "__xpu__sfa_head_moment_fuse_pass", - "__xpu__mmdnn_fuse_pass", - "__xpu__multi_encoder_fuse_pass", - "__xpu__embedding_with_eltwise_add_fuse_pass", - "__xpu__fc_fuse_pass", - "quantized_op_attributes_inference_pass", // Only for fully - // quantized model, infer - // the output scale and - // fix the attribute - // 'enable_int8' for all - // of the quantized ops. - "npu_subgraph_pass", - "huawei_ascend_npu_subgraph_pass", - "xpu_subgraph_pass", - "bm_subgraph_pass", - "apu_subgraph_pass", - "rknpu_subgraph_pass", - "mlu_subgraph_pass", - "control_flow_op_unused_inputs_and_outputs_eliminate_pass", - "static_kernel_pick_pass", // pick original kernel from graph - - "remove_tf_redundant_ops_pass", - "variable_place_inference_pass", // inference arg/var's - - "mlu_postprocess_pass", - // info(target/precision/layout/device) - // using kernel info - "argument_type_display_pass", // debug pass: show arg-type-node's - // info - // (target/precision/layout/device) - - "type_target_cast_pass", // add io_copy/io_copy_once if meet - // different targets when last and next - // node - "variable_place_inference_pass", // - "argument_type_display_pass", // - - "io_copy_kernel_pick_pass", // - "argument_type_display_pass", // - - "variable_place_inference_pass", // - "argument_type_display_pass", // - - "type_precision_cast_pass", // - "variable_place_inference_pass", // - "argument_type_display_pass", // - - "type_layout_cast_pass", // add layout/layout_once op if meet - // different layout when last and next node - "argument_type_display_pass", // - - "variable_place_inference_pass", // - "argument_type_display_pass", - - "runtime_context_assign_pass", - "argument_type_display_pass", - "lite_reshape_fuse_pass", - - "memory_optimize_pass"}}; + "identity_dropout_eliminate_pass", + "__xpu__resnet_fuse_pass", + "__xpu__resnet_cbam_fuse_pass", + "__xpu__conv2d_fuse_pass", + "__xpu__conv2d_link_previous_out_max_pass", + "__xpu__sfa_head_meanstd_fuse_pass", + "__xpu__sfa_head_moment_fuse_pass", + "__xpu__mmdnn_fuse_pass", + "__xpu__multi_encoder_fuse_pass", + "__xpu__embedding_with_eltwise_add_fuse_pass", + "__xpu__fc_fuse_pass", + "quantized_op_attributes_inference_pass", // Only for fully + // quantized model, infer + // the output scale and + // fix the attribute + // 'enable_int8' for all + // of the quantized ops. + "npu_subgraph_pass", + "huawei_ascend_npu_subgraph_pass", + "xpu_subgraph_pass", + "bm_subgraph_pass", + "apu_subgraph_pass", + "rknpu_subgraph_pass", + "mlu_subgraph_pass", + "control_flow_op_unused_inputs_and_outputs_eliminate_pass", + "static_kernel_pick_pass", // pick original kernel from graph + + "remove_tf_redundant_ops_pass", + "variable_place_inference_pass", // inference arg/var's + + "mlu_postprocess_pass", + // info(target/precision/layout/device) + // using kernel info + "argument_type_display_pass", // debug pass: show arg-type-node's + // info + // (target/precision/layout/device) + + "type_target_cast_pass", // add io_copy/io_copy_once if meet + // different targets when last and next + // node + "variable_place_inference_pass", // + "argument_type_display_pass", // + + "io_copy_kernel_pick_pass", // + "argument_type_display_pass", // + + "variable_place_inference_pass", // + "argument_type_display_pass", // + + "type_precision_cast_pass", // + "variable_place_inference_pass", // + "argument_type_display_pass", // + + "type_layout_cast_pass", // add layout/layout_once op if meet + // different layout when last and next node + "argument_type_display_pass", // + + "variable_place_inference_pass", // + "argument_type_display_pass", + + "runtime_context_assign_pass", + "argument_type_display_pass", + "lite_reshape_fuse_pass", + "memory_optimize_pass" // you can comment this line when enable + // PRECISION_PROFILE + }}; if (passes.size() == 1) { // multi_stream_analysis_pass must be in the front of diff --git a/lite/core/profile/precision_profiler.h b/lite/core/profile/precision_profiler.h index fda2b74f8f..5ad541ad7c 100644 --- a/lite/core/profile/precision_profiler.h +++ b/lite/core/profile/precision_profiler.h @@ -18,10 +18,18 @@ * of each kernel. */ #pragma once + +#include +#include + #include +#include +#include +#include #include #include #include "lite/core/program.h" +#include "lite/utils/io.h" #ifdef LITE_WITH_X86 #include "lite/fluid/float16.h" #endif @@ -40,14 +48,50 @@ namespace paddle { namespace lite { namespace profile { +static const std::string get_date_str() { + struct tm tm_time; + time_t timestamp = time(NULL); + localtime_r(×tamp, &tm_time); + struct timeval tv; + gettimeofday(&tv, NULL); + + // print date / time + std::string date_str = + std::to_string(1900 + tm_time.tm_year) + + std::to_string(1 + tm_time.tm_mon) + std::to_string(tm_time.tm_mday) + + '_' + std::to_string(tm_time.tm_hour) + std::to_string(tm_time.tm_min) + + std::to_string(tm_time.tm_sec) + '_' + std::to_string(tv.tv_usec / 1000); + return date_str; +} + +inline std::string generate_valid_tensor_name(const std::string& name) { + std::string new_name(""); + for (size_t i = 0; i < name.length(); ++i) { + if (name[i] != '/') { + new_name += name[i]; + } else { + new_name += "_"; + } + } + return new_name; +} + template -static bool write_tensorfile(const Tensor* tensor, const std::string& locate) { - if (locate.find('/') != std::string::npos) { - return false; +static bool write_tensorfile( + const Tensor* tensor, + const std::string& tensor_name, + const std::string prefix_path = "/storage/emulated/0/") { + std::string new_tensor_name = generate_valid_tensor_name(tensor_name); + if (tensor_name.find('/') != std::string::npos) { + LOG(ERROR) << "--> tensor name is abnormal with '\\':" << tensor_name + << " !!!, replace with '_'," << new_tensor_name + << new_tensor_name; } - FILE* fp = fopen(locate.c_str(), "w"); + + std::string tensor_save_path = prefix_path + new_tensor_name + ".txt"; + FILE* fp = fopen(tensor_save_path.c_str(), "w"); if (fp == nullptr) { - LOG(ERROR) << "file open field " << locate; + LOG(ERROR) << "failed open file " << tensor_save_path; return false; } else { const dtype* data = tensor->data(); @@ -56,19 +100,23 @@ static bool write_tensorfile(const Tensor* tensor, const std::string& locate) { } } fclose(fp); + LOG(INFO) << "write tensor " << tensor_name + << " to file:" << tensor_save_path; return true; } -static bool write_precision_summary_tofile(const std::string& string, - const std::string& log_dir = "") { - if (log_dir == "") { - LOG(INFO) << "The `log_dir` of precision summary file is not set. log_dir:" - << log_dir; +static bool write_precision_summary_tofile( + const std::string& string, const std::string& summary_log_dir = "") { + if (summary_log_dir == "") { + LOG(INFO) << "The `summary_log_dir` of precision summary file is not set. " + "summary_log_dir:" + << summary_log_dir; return false; } - FILE* fp = fopen(log_dir.c_str(), "a"); + + FILE* fp = fopen(summary_log_dir.c_str(), "a"); if (fp == nullptr) { - LOG(INFO) << "Open precision summary file:" << log_dir << "failed."; + LOG(INFO) << "Open precision summary file:" << summary_log_dir << "failed."; return false; } else { fprintf(fp, "%s\n", string.c_str()); @@ -85,7 +133,14 @@ class PrecisionProfiler { std::string inst_precison_str = GetInstPrecision(inst); } - PrecisionProfiler() {} + PrecisionProfiler() { + MkDirRecur(log_dir_); + const char* write_to_file_raw = + std::getenv("PADDLELITE_PRECISION_WRITE_TO_FILE"); + write_result_to_file_ = (write_to_file_raw && atoi(write_to_file_raw) > 0) + ? atoi(write_to_file_raw) > 0 + : false; + } std::string GetSummaryHeader() { using std::setw; @@ -102,9 +157,9 @@ class PrecisionProfiler { << " " << setw(15) << left << "std_deviation" << " " << setw(15) << left << "ave_grow_rate*" << std::endl; - // write to file with path: `log_dir` - if (log_dir_ != "") { - FILE* fp = fopen(log_dir_.c_str(), "a"); + // write to file with path: `summary_log_dir` + if (summary_log_dir_ != "") { + FILE* fp = fopen(summary_log_dir_.c_str(), "a"); std::string header_str{ss.str()}; fprintf(fp, "%s\n", header_str.c_str()); fclose(fp); @@ -112,6 +167,18 @@ class PrecisionProfiler { return ss.str(); } + std::string GetSummaryTail() { + STL::stringstream ss; + ss << "[note]" << std::endl; + ss << "1. `ave_grow_rate`: show the sequence value of tensor when std_dev " + "& mean are same." + << std::endl; + ss << "2. Enable write each output tensor to file: `export " + "PADDLELITE_PRECISION_WRITE_TO_FILE=1` on ADB command line." + << std::endl; + return ss.str(); + } + template double compute_mean(const T* in, const size_t length) { double sum = 0.; @@ -157,6 +224,17 @@ class PrecisionProfiler { return false; } + std::string rename_out_for_mem_reuse_pass(const std::string& old_name) { + if (out_tensor_names_map.find(old_name) == out_tensor_names_map.end()) { + out_tensor_names_map[old_name] = 1; + } else { + ++out_tensor_names_map[old_name]; + } + std::string new_name = + old_name + "_" + std::to_string(out_tensor_names_map[old_name]); + return new_name; + } + void compute_tensor_precision_info(const Tensor* in, TargetType target_type, PrecisionType precision_type, @@ -180,7 +258,7 @@ class PrecisionProfiler { *std_dev = compute_standard_deviation(ptr, in->numel(), true, *mean); *ave_grow_rate = compute_average_grow_rate(ptr, in->numel()); - write_result_to_file&& write_tensorfile(in, name); + write_result_to_file&& write_tensorfile(in, name, log_dir_); return; } case PRECISION(kAny): { @@ -189,7 +267,7 @@ class PrecisionProfiler { *std_dev = compute_standard_deviation(ptr, in->numel(), true, *mean); *ave_grow_rate = compute_average_grow_rate(ptr, in->numel()); - write_result_to_file&& write_tensorfile(in, name); + write_result_to_file&& write_tensorfile(in, name, log_dir_); return; } case PRECISION(kInt8): { @@ -198,7 +276,7 @@ class PrecisionProfiler { *std_dev = compute_standard_deviation(ptr, in->numel(), true, *mean); *ave_grow_rate = compute_average_grow_rate(ptr, in->numel()); - write_result_to_file&& write_tensorfile(in, name); + write_result_to_file&& write_tensorfile(in, name, log_dir_); return; } case PRECISION(kInt32): { @@ -207,7 +285,7 @@ class PrecisionProfiler { *std_dev = compute_standard_deviation( ptr, in->numel(), true, *mean); *ave_grow_rate = compute_average_grow_rate(ptr, in->numel()); - write_result_to_file&& write_tensorfile(in, name); + write_result_to_file&& write_tensorfile(in, name, log_dir_); return; } case PRECISION(kInt64): { @@ -254,7 +332,14 @@ class PrecisionProfiler { real_out_v.data(), in->numel(), true, *mean); *ave_grow_rate = compute_average_grow_rate(real_out_v.data(), real_out_v.size()); - write_result_to_file&& write_tensorfile(in, name); + std::shared_ptr real_out_t(new lite::Tensor); + real_out_t->Resize(in->dims()); + float* real_out_data = real_out_t->mutable_data(); + memcpy(real_out_data, + real_out_v.data(), + real_out_v.size() * sizeof(float)); + write_result_to_file&& write_tensorfile( + real_out_t.get(), name, log_dir_); return; } case DATALAYOUT(kNCHW): { @@ -269,7 +354,14 @@ class PrecisionProfiler { in_data_v.data(), in->numel(), true, *mean); *ave_grow_rate = compute_average_grow_rate(in_data_v.data(), in->numel()); - write_result_to_file&& write_tensorfile(in, name); + std::shared_ptr real_out_t(new lite::Tensor); + real_out_t->Resize(in->dims()); + float* real_out_data = real_out_t->mutable_data(); + memcpy(real_out_data, + in_data_v.data(), + in_data_v.size() * sizeof(float)); + write_result_to_file&& write_tensorfile( + real_out_t.get(), name, log_dir_); return; } default: @@ -296,7 +388,7 @@ class PrecisionProfiler { in_data_v.data(), in->numel(), true, *mean); *ave_grow_rate = compute_average_grow_rate(in_data_v.data(), in->numel()); - write_result_to_file&& write_tensorfile(in, name); + write_result_to_file&& write_tensorfile(in, name, log_dir_); return; } case PRECISION(kInt32): { @@ -311,7 +403,7 @@ class PrecisionProfiler { in_data_v.data(), in->numel(), true, *mean); *ave_grow_rate = compute_average_grow_rate(in_data_v.data(), in->numel()); - write_result_to_file&& write_tensorfile(in, name); + write_result_to_file&& write_tensorfile(in, name, log_dir_); return; } case PRECISION(kInt64): { @@ -326,7 +418,7 @@ class PrecisionProfiler { in_data_v.data(), in->numel(), true, *mean); *ave_grow_rate = compute_average_grow_rate(in_data_v.data(), in->numel()); - write_result_to_file&& write_tensorfile(in, name); + write_result_to_file&& write_tensorfile(in, name, log_dir_); return; } case PRECISION(kFP16): { @@ -347,7 +439,7 @@ class PrecisionProfiler { in_data_v.data(), in->numel(), true, *mean); *ave_grow_rate = compute_average_grow_rate(in_data_v.data(), in->numel()); - write_result_to_file&& write_tensorfile(in, name); + write_result_to_file&& write_tensorfile(in, name, log_dir_); return; } default: @@ -372,12 +464,12 @@ class PrecisionProfiler { using std::left; using std::fixed; STL::stringstream ss; - bool write_result_to_file = false; VLOG(1) << ">> Running kernel: " << inst->op()->op_info()->Repr() << " registered on " << TargetToStr(inst->kernel()->target()) << "/" << PrecisionToStr(inst->kernel()->precision()) << "/" - << DataLayoutToStr(inst->kernel()->layout()); + << DataLayoutToStr(inst->kernel()->layout()) + << ", write_result_to_file_:" << write_result_to_file_; std::string kernel_repr = inst->op()->op_info()->Repr(); std::string kernel_place = TargetToStr(inst->kernel()->target()) + "/" + @@ -404,6 +496,7 @@ class PrecisionProfiler { std::string mean_str{"unused"}; std::string std_dev_str{"unused"}; std::string ave_grow_rate_str{"unused"}; + std::string new_out_name = rename_out_for_mem_reuse_pass(out_name); if (!is_unused(tout)) { compute_tensor_precision_info(tout, @@ -413,14 +506,14 @@ class PrecisionProfiler { &mean, &std_dev, &ave_grow_rate, - out_name, - write_result_to_file); + new_out_name, + write_result_to_file_); mean_str = std::to_string(mean); std_dev_str = std::to_string(std_dev); ave_grow_rate_str = std::to_string(ave_grow_rate); } std::string kernel_info = op_name + ":" + kernel_place; - std::string output_arg_info = out_name + ":" + + std::string output_arg_info = new_out_name + ":" + TargetToStr(type->target()) + "/" + PrecisionToStr(type->precision()) + "/" + DataLayoutToStr(type->layout()); @@ -441,6 +534,7 @@ class PrecisionProfiler { std::string mean_str{"unused"}; std::string std_dev_str{"unused"}; std::string ave_grow_rate_str{"unused"}; + std::string new_out_name = rename_out_for_mem_reuse_pass(out_name); if (!is_unused(tout)) { compute_tensor_precision_info(tout, @@ -450,14 +544,14 @@ class PrecisionProfiler { &mean, &std_dev, &ave_grow_rate, - out_name, - write_result_to_file); + new_out_name, + write_result_to_file_); mean_str = std::to_string(mean); std_dev_str = std::to_string(std_dev); ave_grow_rate_str = std::to_string(ave_grow_rate); } std::string kernel_info = op_name + ":" + kernel_place; - std::string output_arg_info = out_name + ":" + + std::string output_arg_info = new_out_name + ":" + TargetToStr(type->target()) + "/" + PrecisionToStr(type->precision()) + "/" + DataLayoutToStr(type->layout()); @@ -471,12 +565,16 @@ class PrecisionProfiler { } } } - write_precision_summary_tofile(ss.str(), log_dir_); + write_precision_summary_tofile(ss.str(), summary_log_dir_); return ss.str(); } private: - std::string log_dir_{"/storage/emulated/0/precision.log"}; + std::string log_dir_{"/storage/emulated/0/PaddleLite_" + get_date_str() + + "/"}; + std::string summary_log_dir_{log_dir_ + "precision_summary.log"}; + std::map out_tensor_names_map; + bool write_result_to_file_{false}; }; } // namespace profile diff --git a/lite/core/program.cc b/lite/core/program.cc index 14855e778d..c8ecb06433 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -302,7 +302,9 @@ void RuntimeProgram::Run() { LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 1); #endif #ifdef LITE_WITH_PRECISION_PROFILE - LOG(INFO) << "\n" << precision_profiler_summary; + LOG(INFO) << "\n" + << precision_profiler_summary + << inst_precision_profiler.GetSummaryTail(); #endif } diff --git a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc index 6427f4c46d..73a5ea7655 100644 --- a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc +++ b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc @@ -29,6 +29,21 @@ int64_t ShapeProduction(const shape_t& shape) { return res; } +std::string ShapePrint(const std::vector& shapes) { + std::string shapes_str{""}; + for (size_t shape_idx = 0; shape_idx < shapes.size(); ++shape_idx) { + auto shape = shapes[shape_idx]; + std::string shape_str; + for (auto i : shape) { + shape_str += std::to_string(i) + ","; + } + shapes_str += shape_str; + shapes_str += + (shape_idx != 0 && shape_idx == shapes.size() - 1) ? "" : " : "; + } + return shapes_str; +} + std::string ShapePrint(const shape_t& shape) { std::string shape_str{""}; for (auto i : shape) { @@ -37,6 +52,37 @@ std::string ShapePrint(const shape_t& shape) { return shape_str; } +std::vector split_string(const std::string& str_in) { + std::vector str_out; + std::string tmp_str = str_in; + while (!tmp_str.empty()) { + size_t next_offset = tmp_str.find(":"); + str_out.push_back(tmp_str.substr(0, next_offset)); + if (next_offset == std::string::npos) { + break; + } else { + tmp_str = tmp_str.substr(next_offset + 1); + } + } + return str_out; +} + +std::vector get_shape(const std::string& str_shape) { + std::vector shape; + std::string tmp_str = str_shape; + while (!tmp_str.empty()) { + int dim = atoi(tmp_str.data()); + shape.push_back(dim); + size_t next_offset = tmp_str.find(","); + if (next_offset == std::string::npos) { + break; + } else { + tmp_str = tmp_str.substr(next_offset + 1); + } + } + return shape; +} + template double compute_mean(const T* in, const size_t length) { double sum = 0.; @@ -70,7 +116,7 @@ inline double GetCurrentUS() { } void RunModel(std::string model_dir, - const shape_t& input_shape, + const std::vector& input_shapes, size_t repeats, size_t warmup, size_t print_output_elem, @@ -111,12 +157,19 @@ void RunModel(std::string model_dir, CreatePaddlePredictor(config); // 3. Prepare input data - std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); - input_tensor->Resize( - {input_shape[0], input_shape[1], input_shape[2], input_shape[3]}); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { - data[i] = 1; + std::cout << "input_shapes.size():" << input_shapes.size() << std::endl; + for (int j = 0; j < input_shapes.size(); ++j) { + auto input_tensor = predictor->GetInput(j); + input_tensor->Resize(input_shapes[j]); + auto input_data = input_tensor->mutable_data(); + int input_num = 1; + for (int i = 0; i < input_shapes[j].size(); ++i) { + input_num *= input_shapes[j][i]; + } + + for (int i = 0; i < input_num; ++i) { + input_data[i] = 1.f; + } } // 4. Run predictor @@ -142,7 +195,7 @@ void RunModel(std::string model_dir, } avg_duration = sum_duration / static_cast(repeats); std::cout << "\n======= benchmark summary =======\n" - << "input_shape(NCHW):" << ShapePrint(input_shape) << "\n" + << "input_shape(s) (NCHW):" << ShapePrint(input_shapes) << "\n" << "model_dir:" << model_dir << "\n" << "warmup:" << warmup << "\n" << "repeats:" << repeats << "\n" @@ -184,18 +237,19 @@ void RunModel(std::string model_dir, } int main(int argc, char** argv) { - shape_t input_shape{1, 3, 224, 224}; // shape_t ==> std::vector + std::vector str_input_shapes; + std::vector input_shapes{ + {1, 3, 224, 224}}; // shape_t ==> std::vector + int repeats = 10; int warmup = 10; int print_output_elem = 0; - if (argc > 2 && argc < 9) { + if (argc > 2 && argc < 6) { std::cerr << "usage: ./" << argv[0] << "\n" << " \n" - << " \n" - << " \n" - << " \n" - << " \n" + << " , eg: 1,3,224,224 for 1 input; " + "1,3,224,224:1,5 for 2 inputs\n" << " \n" << " \n" << " " << std::endl; @@ -203,14 +257,19 @@ int main(int argc, char** argv) { } std::string model_dir = argv[1]; - if (argc >= 9) { - input_shape[0] = atoi(argv[2]); - input_shape[1] = atoi(argv[3]); - input_shape[2] = atoi(argv[4]); - input_shape[3] = atoi(argv[5]); - repeats = atoi(argv[6]); - warmup = atoi(argv[7]); - print_output_elem = atoi(argv[8]); + if (argc >= 6) { + input_shapes.clear(); + std::string raw_input_shapes = argv[2]; + std::cout << "raw_input_shapes: " << raw_input_shapes << std::endl; + str_input_shapes = split_string(raw_input_shapes); + for (size_t i = 0; i < str_input_shapes.size(); ++i) { + std::cout << "input shape: " << str_input_shapes[i] << std::endl; + input_shapes.push_back(get_shape(str_input_shapes[i])); + } + + repeats = atoi(argv[3]); + warmup = atoi(argv[4]); + print_output_elem = atoi(argv[5]); } // set arm power mode: // 0 for big cluster, high performance @@ -220,7 +279,7 @@ int main(int argc, char** argv) { size_t power_mode = 0; RunModel( - model_dir, input_shape, repeats, warmup, print_output_elem, power_mode); + model_dir, input_shapes, repeats, warmup, print_output_elem, power_mode); return 0; } diff --git a/lite/utils/logging.cc b/lite/utils/logging.cc index cc5a5b408a..768d4e0972 100644 --- a/lite/utils/logging.cc +++ b/lite/utils/logging.cc @@ -35,7 +35,6 @@ void gen_log(STL::ostream& log_stream_, const int kMaxLen) { const int len = strlen(file); - std::string time_str; struct tm tm_time; // Time of creation of LogMessage time_t timestamp = time(NULL); #if defined(_WIN32) -- GitLab