// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include "lite/core/context.h"
#include "lite/core/profile/timer.h"
#include "lite/operators/op_params.h"
#include "lite/tests/utils/naive_math_impl.h"
#include "lite/tests/utils/tensor_utils.h"
#ifdef LITE_WITH_ARM
#include "lite/kernels/arm/pool_compute.h"
#endif  // LITE_WITH_ARM

DEFINE_int32(power_mode,
             3,
             "power mode: "
             "0 for POWER_HIGH;"
             "1 for POWER_LOW;"
             "2 for POWER_FULL;"
             "3 for NO_BIND");
DEFINE_int32(threads, 1, "threads num");
DEFINE_int32(warmup, 0, "warmup times");
DEFINE_int32(repeats, 1, "repeats times");
DEFINE_bool(basic_test, false, "do all tests");
DEFINE_bool(check_result, true, "check the result");

DEFINE_int32(batch, 1, "batch size");
DEFINE_int32(in_channel, 32, "input channel");
DEFINE_int32(in_height, 112, "input height");
DEFINE_int32(in_width, 112, "input width");

DEFINE_int32(kernel_h, 3, "kernel height");
DEFINE_int32(kernel_w, 3, "kernel width");
DEFINE_int32(pad_h, 1, "pad height");
DEFINE_int32(pad_w, 1, "pad width");
DEFINE_int32(stride_h, 1, "stride height");
DEFINE_int32(stride_w, 1, "stride width");

DEFINE_bool(ceil_mode, true, "do ceil_mode");
DEFINE_bool(flag_global, true, "global pooling");
DEFINE_bool(exclusive, true, "do exclusive");
DEFINE_bool(adaptive, false, "do adaptive pooling");
DEFINE_bool(use_quantizer, false, "use quantizer");
DEFINE_string(pooling_type, "max", "pooling type: max or avg");

typedef paddle::lite::DDim DDim;
typedef paddle::lite::Tensor Tensor;
typedef paddle::lite::operators::PoolParam PoolParam;
using paddle::lite::profile::Timer;

DDim compute_out_dim(const DDim& dim_in,
                     const paddle::lite::operators::PoolParam& param) {
  DDim dim_out = dim_in;
  auto kernel_h = param.ksize[0];
  auto kernel_w = param.ksize[1];
  auto h = dim_in[2];
  auto w = dim_in[3];
  auto paddings = *param.paddings;
  int stride_h = param.strides[0];
  int stride_w = param.strides[1];
  bool ceil_mode = param.ceil_mode;
  bool flag_global = param.global_pooling;
  int hout = 1;
  int wout = 1;
  if (!flag_global) {
    if (!ceil_mode) {
      hout = (h - kernel_h + paddings[0] + paddings[1]) / stride_h + 1;
      wout = (w - kernel_w + paddings[2] + paddings[3]) / stride_w + 1;
    } else {
      hout = (h - kernel_h + paddings[0] + paddings[1] + stride_h - 1) /
                 stride_h +
             1;
      wout = (w - kernel_w + paddings[2] + paddings[3] + stride_w - 1) /
                 stride_w +
             1;
    }
  }
  dim_out[2] = hout;
  dim_out[3] = wout;
  return dim_out;
}
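
// Worked example of the output-size formula above (values are illustrative
// only, not taken from the tests below): with a 6x6 input, 3x3 kernel,
// stride 2 and zero padding,
//   floor mode: hout = (6 - 3 + 0 + 0) / 2 + 1 = 2
//   ceil mode:  hout = (6 - 3 + 0 + 0 + 2 - 1) / 2 + 1 = 3
// i.e. ceil_mode emits one extra, partially covered, output row/column.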

void pooling_basic(const float* din,
                   float* dout,
                   int num,
                   int chout,
                   int hout,
                   int wout,
                   int chin,
                   int hin,
                   int win,
                   const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings,
                   bool global_pooling,
                   bool exclusive,
                   bool adaptive,
                   bool ceil_mode,
                   bool use_quantizer,
                   const std::string& pooling_type) {
  // no need to pad input tensor, border is zero pad inside this function
  memset(dout, 0, num * chout * hout * wout * sizeof(float));
  int kernel_h = ksize[0];
  int kernel_w = ksize[1];
  int stride_h = strides[0];
  int stride_w = strides[1];
  int pad_h = paddings[0];
  int pad_w = paddings[2];
  int size_channel_in = win * hin;
  int size_channel_out = wout * hout;
  if (global_pooling) {
    if (pooling_type == "max") {  // Pooling_max
      for (int n = 0; n < num; ++n) {
        float* dout_batch = dout + n * chout * size_channel_out;
        const float* din_batch = din + n * chin * size_channel_in;
#pragma omp parallel for
        for (int c = 0; c < chout; ++c) {
          const float* din_ch = din_batch + c * size_channel_in;  // in address
          float tmp1 = din_ch[0];
          for (int i = 0; i < size_channel_in; ++i) {
            float tmp2 = din_ch[i];
            tmp1 = tmp1 > tmp2 ? tmp1 : tmp2;
          }
          dout_batch[c] = tmp1;
        }
      }
    } else if (pooling_type == "avg") {
      // Pooling_average_include_padding
      // Pooling_average_exclude_padding
      for (int n = 0; n < num; ++n) {
        float* dout_batch = dout + n * chout * size_channel_out;
        const float* din_batch = din + n * chin * size_channel_in;
#pragma omp parallel for
        for (int c = 0; c < chout; ++c) {
          const float* din_ch = din_batch + c * size_channel_in;  // in address
          float sum = 0.f;
          for (int i = 0; i < size_channel_in; ++i) {
            sum += din_ch[i];
          }
          dout_batch[c] = sum / size_channel_in;
        }
      }
    } else {
      LOG(FATAL) << "unsupported pooling type: " << pooling_type;
    }
  } else {
    for (int ind_n = 0; ind_n < num; ++ind_n) {
#pragma omp parallel for
      for (int ind_c = 0; ind_c < chin; ++ind_c) {
        for (int ind_h = 0; ind_h < hout; ++ind_h) {
          int sh = ind_h * stride_h;
          int eh = sh + kernel_h;
          sh = (sh - pad_h) < 0 ? 0 : sh - pad_h;
          eh = (eh - pad_h) > hin ? hin : eh - pad_h;
          for (int ind_w = 0; ind_w < wout; ++ind_w) {
            int sw = ind_w * stride_w;
            int ew = sw + kernel_w;
            sw = (sw - pad_w) < 0 ? 0 : sw - pad_w;
            ew = (ew - pad_w) > win ? win : ew - pad_w;
            float result = static_cast<float>(0);
            int dst_ind = (ind_n * chout + ind_c) * size_channel_out +
                          ind_h * wout + ind_w;
            for (int kh = sh; kh < eh; ++kh) {
              for (int kw = sw; kw < ew; ++kw) {
                int src_ind = (ind_n * chin + ind_c) * size_channel_in +
                              kh * win + kw;
                if (kh == sh && kw == sw) {
                  result = din[src_ind];
                } else {
                  if (pooling_type == "max") {
                    result = result >= din[src_ind] ? result : din[src_ind];
                  } else if (pooling_type == "avg") {
                    result += din[src_ind];
                  }
                }
              }
            }
            if (pooling_type == "avg") {
              if (exclusive) {
                int div = (ew - sw) * (eh - sh);
                div = div > 0 ? div : 1;
                result /= div;
              } else {
                int bh = kernel_h;
                int bw = kernel_w;
                if (ew == win) {
                  bw = (sw + kernel_w) >= (win + paddings[3])
                           ? (win + paddings[3])
                           : (sw + kernel_w);
                  bw -= sw;
                  if ((sw - pad_w) < 0 &&
                      (sw + kernel_w) > (win + paddings[3])) {
                    bw += pad_w;
                  }
                }
                if (eh == hin) {
                  bh = (sh + kernel_h) >= (hin + paddings[1])
                           ? (hin + paddings[1])
                           : (sh + kernel_h);
                  bh -= sh;
                  if ((sh - pad_h) < 0 &&
                      (sh + kernel_h) > (hin + paddings[1])) {
                    bh += pad_h;
                  }
                }
                result /= bh * bw;
              }
            }
            dout[dst_ind] = result;
          }
        }
      }
    }
  }
}
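
// Reading aid for the divisor logic above (hypothetical values, not used by
// the tests): avg pooling with a 3x3 kernel, stride 1, pads {1, 1, 1, 1} on a
// 4x4 single-channel input. The top-left output window only covers a 2x2
// patch of valid input, so
//   exclusive == true  -> divisor = (ew - sw) * (eh - sh) = 2 * 2 = 4
//   exclusive == false -> divisor = bh * bw = 3 * 3 = 9 (padding counted in).
// A direct call to the reference (din/dout being caller-owned float buffers)
// would look roughly like:
//   pooling_basic(din, dout, 1, 1, 4, 4, 1, 4, 4,
//                 {3, 3}, {1, 1}, {1, 1, 1, 1},
//                 false, /*exclusive=*/false, false, false, false, "avg");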

#ifdef LITE_WITH_ARM
void test_pool_fp32(const std::vector<DDim>& input_dims,
                    const std::vector<int>& ksize,
                    const std::vector<int>& strides,
                    const std::vector<int>& pads,
                    bool ceil_mode,
                    bool flag_global,
                    bool exclusive,
                    bool adaptive,
                    bool use_quantizer,
                    std::string pooling_type,
                    const std::vector<int>& thread_num,
                    const std::vector<int>& power_mode) {
#ifdef LITE_WITH_ARM
  paddle::lite::DeviceInfo::Init();
#endif
  PoolParam param;
  param.x = new Tensor;
  param.x->set_precision(PRECISION(kFloat));
  param.ksize = ksize;
  param.strides = strides;
  param.paddings = std::make_shared<std::vector<int>>(pads);
  param.ceil_mode = ceil_mode;
  param.global_pooling = flag_global;
  param.pooling_type = pooling_type;
  param.exclusive = exclusive;
  param.adaptive = adaptive;
  param.use_quantizer = use_quantizer;
  param.output = new Tensor;
  param.output->set_precision(PRECISION(kFloat));

  for (auto& cls : power_mode) {
    for (auto& th : thread_num) {
      paddle::lite::kernels::arm::PoolCompute pool;
      std::unique_ptr<paddle::lite::KernelContext> ctx1(
          new paddle::lite::KernelContext);
      auto& ctx = ctx1->As<paddle::lite::ARMContext>();
      ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(cls), th);
      /// set param and context
      pool.SetParam(param);
      pool.SetContext(std::move(ctx1));
      /// prepare for run
      pool.PrepareForRun();

      for (auto& dim_in : input_dims) {
        DDim dim_out = compute_out_dim(dim_in, param);
        if (dim_out[2] < 1 || dim_out[3] < 1) {
          continue;
        }
        param.x->Resize(dim_in);
        param.output->Resize(dim_out);

        paddle::lite::fill_tensor_rand(*param.x, -1.f, 1.f);
        // paddle::lite::fill_tensor_const(*param.x, 1.f);
        auto din = param.x->data<float>();

        Tensor tout_basic;
        if (FLAGS_check_result) {
          LOG(INFO) << "basic compute";
          tout_basic.set_precision(PRECISION(kFloat));
          tout_basic.Resize(dim_out);
          fill_tensor_const(tout_basic, 0.f);
          auto dout_basic = tout_basic.mutable_data<float>();
          pooling_basic(din,
                        dout_basic,
                        dim_in[0],
                        dim_out[1],
                        dim_out[2],
                        dim_out[3],
                        dim_in[1],
                        dim_in[2],
                        dim_in[3],
                        ksize,
                        strides,
                        pads,
                        flag_global,
                        exclusive,
                        adaptive,
                        ceil_mode,
                        use_quantizer,
                        pooling_type);
        }
        LOG(INFO) << "lite compute";
        /// warm up
        for (int i = 0; i < FLAGS_warmup; ++i) {
          pool.Launch();
        }
        /// compute
        Timer t0;
        for (int i = 0; i < FLAGS_repeats; ++i) {
          t0.Start();
          pool.Launch();
          t0.Stop();
        }

        double gops = 2.0 * dim_out.production() * ksize[0] * ksize[1];
        LOG(INFO) << "pool fp32: input shape: " << dim_in
                  << ", output shape: " << dim_out
                  << ", running time, avg: " << t0.LapTimes().Avg()
                  << ", min time: " << t0.LapTimes().Min()
                  << ", total GOPS: " << 1e-9 * gops
                  << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg()
                  << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min();

        if (FLAGS_check_result) {
          double max_ratio = 0;
          double max_diff = 0;
          tensor_cmp_host(tout_basic, *param.output, max_ratio, max_diff);
          LOG(INFO) << "compare result, max diff: " << max_diff
                    << ", max ratio: " << max_ratio;
          if (std::abs(max_ratio) > 1e-3f) {
            if (max_diff > 5e-4f) {
              LOG(WARNING) << "din";
              print_tensor(*param.x);
              LOG(WARNING) << "basic result";
              print_tensor(tout_basic);
              LOG(WARNING) << "lite result";
              print_tensor(*param.output);
              Tensor tdiff;
              tdiff.Resize(tout_basic.dims());
              tdiff.set_precision(PRECISION(kFloat));
              tensor_diff(tout_basic, *param.output, tdiff);
              print_tensor(tdiff);
              LOG(FATAL) << "test fp32 pool: input: " << dim_in
                         << ", output: " << dim_out
                         << ", kernel dim: " << ksize[0] << ", " << ksize[1]
                         << ", pad: " << pads[0] << ", " << pads[1] << ", "
                         << pads[2] << ", " << pads[3]
                         << ", stride: " << strides[0] << ", " << strides[1]
                         << ", global_pooling: "
                         << (flag_global ? "true" : "false")
                         << ", pooling_type: " << pooling_type
                         << ", ceil_mode: " << (ceil_mode ? "true" : "false")
                         << ", exclusive: " << (exclusive ? "true" : "false")
                         << ", threads: " << th << ", power_mode: " << cls
                         << " failed!!\n";
            }
          }
        }
        LOG(INFO) << "test fp32 pool: input: " << dim_in
                  << ", output: " << dim_out << ", kernel dim: " << ksize[0]
                  << ", " << ksize[1] << ", pad: " << pads[0] << ", "
                  << pads[1] << ", " << pads[2] << ", " << pads[3]
                  << ", stride: " << strides[0] << ", " << strides[1]
                  << ", global_pooling: " << (flag_global ? "true" : "false")
                  << ", pooling_type: " << pooling_type
                  << ", ceil_mode: " << (ceil_mode ? "true" : "false")
                  << ", exclusive: " << (exclusive ? "true" : "false")
                  << ", threads: " << th << ", power_mode: " << cls
                  << " succeeded!!\n";
      }
    }
  }

  delete param.x;
  delete param.output;
}
#else
void test_pool_fp32(const std::vector<DDim>& input_dims,
                    const std::vector<int>& ksize,
                    const std::vector<int>& strides,
                    const std::vector<int>& pads,
                    bool ceil_mode,
                    bool flag_global,
                    bool exclusive,
                    bool adaptive,
                    bool use_quantizer,
                    std::string pooling_type,
                    const std::vector<int>& thread_num,
                    const std::vector<int>& power_mode) {}
#endif  // LITE_WITH_ARM
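
// Example invocations (sketch only; the binary name depends on the build
// target and is assumed here to be test_pool_fp32):
//   ./test_pool_fp32 --basic_test=true                  # random sweep below
//   ./test_pool_fp32 --in_channel=64 --kernel_h=2 --kernel_w=2 \
//       --stride_h=2 --stride_w=2 --pooling_type=avg    # custom-size test
// All flags default to the DEFINE_* values declared at the top of this file.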

#if 1  /// random param pool
TEST(TestPoolRand, test_pool_rand) {
  if (FLAGS_basic_test) {
    for (auto& cin : {1, 3, 8, 16}) {
      for (auto& kw : {1, 2, 3}) {
        for (auto& kh : {1, 2, 3}) {
          for (auto& stride : {1, 2}) {
            for (auto& pad_top : {0, 1, 2}) {
              for (auto& pad_bottom : {0, 1, 2}) {
                for (auto& pad_left : {0, 1, 2}) {
                  for (auto& pad_right : {0, 1, 2}) {
                    for (auto& flag_global : {false, true}) {
                      for (auto& exclusive : {false, true}) {
                        for (auto& ceil_mode : {false, true}) {
                          for (auto& pooling_type : {"max", "avg"}) {
                            bool adaptive = false;
                            bool use_quantizer = false;
                            std::vector<DDim> dims;
                            for (auto& batch : {1, 2}) {
                              for (auto& h : {1, 2, 3, 4, 11, 19, 32, 28}) {
                                dims.push_back(DDim({batch, cin, h, h}));
                              }
                            }
                            test_pool_fp32(
                                dims,
                                {kh, kw},
                                {stride, stride},
                                {pad_top, pad_bottom, pad_left, pad_right},
                                ceil_mode,
                                flag_global,
                                exclusive,
                                adaptive,
                                use_quantizer,
                                pooling_type,
                                {4},
                                {FLAGS_power_mode});
                          }
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}
#endif  /// random param pool

#if 1  /// custom
TEST(TestPoolCustom, test_pool_fp32_custom_size) {
  test_pool_fp32(
      {DDim({FLAGS_batch, FLAGS_in_channel, FLAGS_in_height, FLAGS_in_width})},
      {FLAGS_kernel_h, FLAGS_kernel_w},
      {FLAGS_stride_h, FLAGS_stride_w},
      {FLAGS_pad_h, FLAGS_pad_h, FLAGS_pad_w, FLAGS_pad_w},
      FLAGS_ceil_mode,
      FLAGS_flag_global,
      FLAGS_exclusive,
      FLAGS_adaptive,
      FLAGS_use_quantizer,
      FLAGS_pooling_type,
      {FLAGS_threads},
      {FLAGS_power_mode});
}
#endif  // custom