diff --git a/.gitignore b/.gitignore
index 369fa1cb919c82caec326d1429c8a2eba3b928d6..fa01346094773845ba6f11e174774d2f08e47f77 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,7 +10,10 @@ paddle/fluid/operators/distributed/send_recv.proto
 *.vs
 build/
 build_doc/
+build.*
 *.user
+*.sh
+*.bkp
 .vscode
 .idea
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 036a5faf24f24a50361e16b5810bfc7051f07118..4ef4a4c351e4b701f481b5b23076ea3535fa7231 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,7 +43,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
   if(NOT DEFINED TARGET_ARCH_ABI)
     set(ARCH_ABI "arm64-v8a" CACHE STRING "Choose android platform")
   endif()
-
+
   include(cross_compiling/host)
   include(cross_compiling/armlinux)
   include(cross_compiling/android)
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index 8d3864c6b3da5500bb9017437c3cd16f06494abb..9c955103ba70fc087a267eb748c8db9a3e6e8e40 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/framework/op_desc.h"
-#include
 #include
 #include
 #include  // NOLINT
 #include
 #include
 #include
+#include "glog/logging.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/operator.h"
diff --git a/paddle/fluid/lite/CMakeLists.txt b/paddle/fluid/lite/CMakeLists.txt
index ac9ff84da449c6babdb380b54e90920d8cb6e70f..301dbea2b7601d43b20095685d82a11ae5dcc2f6 100644
--- a/paddle/fluid/lite/CMakeLists.txt
+++ b/paddle/fluid/lite/CMakeLists.txt
@@ -172,3 +172,4 @@ add_subdirectory(model_parser)
 add_subdirectory(utils)
 add_subdirectory(api)
 add_subdirectory(gen_code)
+
diff --git a/paddle/fluid/lite/api/CMakeLists.txt b/paddle/fluid/lite/api/CMakeLists.txt
index 9783c55201d46825b7d8f141f18f07a4c24d7795..46f38534c74d7269a440670331f90c33179dffb2 100644
--- a/paddle/fluid/lite/api/CMakeLists.txt
+++ b/paddle/fluid/lite/api/CMakeLists.txt
@@ -54,3 +54,4 @@ lite_cc_binary(cxx_api_lite_bin SRCS cxx_api_bin.cc
   mir_passes
   ${ops_lite} ${host_kernels}
   ARM_DEPS ${arm_kernels})
+
diff --git a/paddle/fluid/lite/api/cxx_api_bin.cc b/paddle/fluid/lite/api/cxx_api_bin.cc
index f53f6105d1bf8abdce928ad8fb8fc36ac79935c6..0cc786c024f6d7447ec57bb4a539ddf8bcdb1c25 100644
--- a/paddle/fluid/lite/api/cxx_api_bin.cc
+++ b/paddle/fluid/lite/api/cxx_api_bin.cc
@@ -32,9 +32,9 @@ void Run(const char* model_dir) {
                  valid_places);
   auto* input_tensor = predictor.GetInput(0);
-  input_tensor->Resize(DDim(std::vector<int64_t>({100, 100})));
+  input_tensor->Resize(DDim(std::vector<int64_t>({3, 224, 224})));
   auto* data = input_tensor->mutable_data<float>();
-  for (int i = 0; i < 100 * 100; i++) {
+  for (int i = 0; i < 3 * 224 * 224; i++) {
     data[i] = i;
   }
@@ -65,6 +65,14 @@ USE_LITE_OP(feed);
 USE_LITE_OP(fetch);
 USE_LITE_OP(io_copy);
+USE_LITE_OP(conv2d);
+// USE_LITE_OP(batch_norm);
+USE_LITE_OP(relu);
+USE_LITE_OP(depthwise_conv2d);
+USE_LITE_OP(pool2d);
+USE_LITE_OP(elementwise_add);
+USE_LITE_OP(softmax);
+
 USE_LITE_KERNEL(feed, kHost, kAny, kAny, def);
 USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
@@ -72,7 +80,15 @@ USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
 USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
+
+USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
+
 // USE_LITE_KERNEL(feed, kARM, kAny, kAny, def);
 // USE_LITE_KERNEL(fetch, kARM, kAny, kAny, def);
 #endif // LITE_WITH_ARM
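Note on the registration lists in cxx_api_bin.cc above: USE_LITE_OP and USE_LITE_KERNEL only force the linker to keep op/kernel symbols that were registered elsewhere, so the op name, target, precision, layout, and alias must match the corresponding registration exactly; a mismatched name shows up as a missing symbol at link time rather than as a runtime error. A rough sketch of how the two sides line up, assuming the usual REGISTER_LITE_KERNEL form used in the lite kernel sources (the compute class and bound argument names below are illustrative placeholders, not the actual ARM implementation):

// Kernel side (sketch): registers an ARM float NCHW pool2d kernel under alias "def".
REGISTER_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW,
                     paddle::lite::kernels::arm::PoolCompute /* placeholder name */, def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})     // illustrative binding
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})  // illustrative binding
    .Finalize();

// Binary side: repeats the same (op, target, precision, layout, alias) tuple so the
// statically registered kernel is pulled into the benchmark binary.
USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def);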
diff --git a/paddle/fluid/lite/api/light_api.h b/paddle/fluid/lite/api/light_api.h
index a43755c87387e6af4d65f541cf1ba61828f3d2a5..474e5da78bd2cd201b17f9a223bd1a177861a532 100644
--- a/paddle/fluid/lite/api/light_api.h
+++ b/paddle/fluid/lite/api/light_api.h
@@ -72,8 +72,9 @@ class LightPredictor {
     // Create the kernels of the target places, and filter out the specific
     // kernel with the target alias.
-    for (auto& op : program.ops()) {
-      auto kernel_type = op->op_info()->GetAttr(kKernelTypeAttr);
+    for (auto& op : program.ops_) {
+      lite::pb::OpDesc desc(op->op_info()->desc());
+      auto kernel_type = desc.GetAttr(kKernelTypeAttr).get();
       std::string op_type, alias;
       Place place;
       KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
@@ -88,8 +89,8 @@ class LightPredictor {
       insts.emplace_back(op, std::move(*it));
     }
     program_.reset(new RuntimeProgram(std::move(insts)));
-    CHECK(program.exec_scope());
-    program_->set_exec_scope(program.exec_scope());
+    CHECK(program.exec_scope_);
+    program_->set_exec_scope(program.exec_scope_);
   }
  private:
diff --git a/paddle/fluid/lite/arm/CMakeLists.txt b/paddle/fluid/lite/arm/CMakeLists.txt
index 8abd04b52338299f75399903aa68fe834ce81d04..1980267380d4ed32f7530ef62861119c9094f015 100644
--- a/paddle/fluid/lite/arm/CMakeLists.txt
+++ b/paddle/fluid/lite/arm/CMakeLists.txt
@@ -1,2 +1,3 @@
 add_subdirectory(math)
+
diff --git a/paddle/fluid/lite/arm/math/CMakeLists.txt b/paddle/fluid/lite/arm/math/CMakeLists.txt
index 8af2c33943f7e2abe7e539b04e3759e8e2d4da33..17d1b7d9b2adc4f048b0e4056d435365f9410b53 100644
--- a/paddle/fluid/lite/arm/math/CMakeLists.txt
+++ b/paddle/fluid/lite/arm/math/CMakeLists.txt
@@ -6,4 +6,33 @@ if(NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM))
   return()
 endif()
-cc_library(math_arm SRCS funcs.cc packed_sgemm.cc softmax.cc scale.cc elementwise.cc DEPS ${lite_kernel_deps} eigen3)
+# TODO(xxx): separate them
+cc_library(math_arm SRCS
+    funcs.cc
+    packed_sgemm.cc
+    softmax.cc
+    scale.cc
+    pooling.cc
+    elementwise.cc
+    sgemv.cc
+    type_trans.cpp
+    conv_impl.cc
+    conv_direct_3x3s1.cc
+    conv_direct_3x3s2.cc
+    conv_direct.cc
+    conv_depthwise_3x3_int7.cc
+    conv_depthwise_3x3_int8.cc
+    conv_depthwise_5x5s1_int8.cc
+    conv_depthwise_3x3p0.cc
+    conv_depthwise_3x3p1.cc
+    conv_depthwise_5x5s1.cc
+    conv_depthwise_5x5s2.cc
+    conv_depthwise.cc
+    conv_gemmlike.cc
+    conv_winograd_3x3.cc
+    conv_winograd.cc
+    split.cc
+    DEPS ${lite_kernel_deps} eigen3 framework_proto_lite)
+    # TODO(TJ): fix me do not deps proto
+
+
diff --git a/paddle/fluid/lite/arm/math/pooling.cc b/paddle/fluid/lite/arm/math/pooling.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fc916d0f37c14fa0fcbed1dc74dc8a0964bac05e
--- /dev/null
+++ b/paddle/fluid/lite/arm/math/pooling.cc
@@ -0,0 +1,3347 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/arm/math/pooling.h" +#include +#include +#include "paddle/fluid/lite/arm/math/funcs.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void pooling_basic(const void* din, void* dout, int num, int chout, int hout, + int wout, int chin, int hin, int win, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, bool global_pooling, + bool exclusive, bool adaptive, bool ceil_mode, + bool use_quantizer, const std::string& pooling_type) { + // no need to pad input tensor, border is zero pad inside this function + int kernel_h = ksize[0]; + int kernel_w = ksize[1]; + int stride_h = strides[0]; + int stride_w = strides[1]; + int pad_h = paddings[0]; + int pad_w = paddings[1]; + int size_channel_in = win * hin; + int size_channel_out = wout * hout; + + float* data_out = static_cast(dout); + const float* data_in = static_cast(din); + + if (global_pooling) { + if (pooling_type == "max") { // Pooling_max + for (int n = 0; n < num; ++n) { + float* data_out_batch = data_out + n * chout * size_channel_out; + const float* data_in_batch = data_in + n * chin * size_channel_in; +#pragma omp parallel for + for (int c = 0; c < chout; ++c) { + const float* data_in_channel = + data_in_batch + c * size_channel_in; // in address + data_out_batch[c] = data_in_channel[0]; + for (int i = 0; i < size_channel_in; ++i) { + data_out_batch[c] = data_out_batch[c] > data_in_channel[i] + ? 
data_out_batch[c] + : data_in_channel[i]; + } + } + } + + } else if (pooling_type == "avg") { + // Pooling_average_include_padding + // Pooling_average_exclude_padding + for (int n = 0; n < num; ++n) { + float* data_out_batch = data_out + n * chout * size_channel_out; + const float* data_in_batch = data_in + n * chin * size_channel_in; +#pragma omp parallel for + for (int c = 0; c < chout; ++c) { + const float* data_in_channel = + data_in_batch + c * size_channel_in; // in address + float sum = 0.f; + for (int i = 0; i < size_channel_in; ++i) { + sum += data_in_channel[i]; + } + data_out_batch[c] = sum / size_channel_in; + } + } + } else { + LOG(FATAL) << "not support"; + } + return; + } + + if (pooling_type == "max") { + // Pooling_max + for (int n = 0; n < num; ++n) { + float* data_out_channel = data_out + n * chout * size_channel_out; + const float* data_in_batch = data_in + n * chin * size_channel_in; +#pragma omp parallel for + for (int q = 0; q < chout; q++) { + float* data_out_row = data_out_channel + q * size_channel_out; + const float* data_in_channel = data_in_batch + q * size_channel_in; + + for (int i = 0; i < hout; i++) { + for (int j = 0; j < wout; j++) { + int hstart = i * stride_h - pad_h; + int wstart = j * stride_w - pad_w; + int hend = std::min(hstart + kernel_h, hin + pad_h); + int wend = std::min(wstart + kernel_w, win + pad_w); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + hend = std::min(hend, hin); + wend = std::min(wend, win); + + data_out_row[j] = data_in_channel[hstart * win + wstart]; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + data_out_row[j] = data_out_row[j] > data_in_channel[h * win + w] + ? data_out_row[j] + : data_in_channel[h * win + w]; + } + } + } + data_out_row += wout; + } + } + } + } else if (pooling_type == "avg") { + if (exclusive == false) { + // Pooling_average_include_padding + for (int n = 0; n < num; ++n) { + int pool_size = + kernel_w * + kernel_h; // (hend - hstart) * (wend - wstart); // problem + float* data_out_channel = data_out + n * chout * size_channel_out; + const float* data_in_batch = data_in + n * chin * size_channel_in; +#pragma omp parallel for + for (int q = 0; q < chout; q++) { + float* data_out_row = data_out_channel + q * size_channel_out; + const float* data_in_channel = data_in_batch + q * size_channel_in; + for (int i = 0; i < hout; i++) { + for (int j = 0; j < wout; j++) { + int hstart = i * stride_h - pad_h; + int wstart = j * stride_w - pad_w; + int hend = std::min(hstart + kernel_h, hin + pad_h); + int wend = std::min(wstart + kernel_w, win + pad_w); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + hend = std::min(hend, hin); + wend = std::min(wend, win); + + int bh = kernel_h; + int bw = kernel_w; + if (wend == win) { + bw = wstart + kernel_w >= win + pad_w ? win + pad_w + : wstart + kernel_w; + bw -= wstart; + } + if (hend == hin) { + bh = hstart + kernel_h >= hin + pad_h ? 
hin + pad_h + : hstart + kernel_h; + bh -= hstart; + } + pool_size = bh * bw; + + data_out_row[j] = data_in_channel[hstart * win + wstart]; + float sum = 0.f; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + sum += data_in_channel[h * win + w]; + } + } + data_out_row[j] = sum / pool_size; + } + data_out_row += wout; + } + } + } + } else { // exclusive == true, Pooling_average_exclude_padding + for (int n = 0; n < num; ++n) { + float* data_out_channel = data_out + n * chout * size_channel_out; + const float* data_in_batch = data_in + n * chin * size_channel_in; +#pragma omp parallel for + for (int q = 0; q < chout; q++) { + float* data_out_row = data_out_channel + q * size_channel_out; + const float* data_in_channel = data_in_batch + q * size_channel_in; + for (int i = 0; i < hout; i++) { + for (int j = 0; j < wout; j++) { + int hstart = i * stride_h - pad_h; + int wstart = j * stride_w - pad_w; + int hend = std::min(hstart + kernel_h, hin + pad_h); + int wend = std::min(wstart + kernel_w, win + pad_w); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + hend = std::min(hend, hin); + wend = std::min(wend, win); + + data_out_row[j] = data_in_channel[hstart * win + wstart]; + float sum = 0.f; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + sum += data_in_channel[h * win + w]; + } + } + int pool_size = (hend - hstart) * (wend - wstart); + data_out_row[j] = sum / pool_size; + } + data_out_row += wout; + } + } + } + } + + } else { + LOG(FATAL) << "not support"; + } +} + +void pooling_global(const void* din, void* dout, int num, int chout, int hout, + int wout, int chin, int hin, int win, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, bool global_pooling, + bool exclusive, bool adaptive, bool ceil_mode, + bool use_quantizer, const std::string& pooling_type) { + int size_channel_in = win * hin; + float* data_out = static_cast(dout); + const float* data_in = static_cast(din); + + int cnt = size_channel_in / 8; + +#if 0 + LOG(INFO) << "size_channel_in:" << size_channel_in; + LOG(INFO) << "cnt:" << cnt; + LOG(INFO) << "num:" << num; + LOG(INFO) << "chout:" << chout; + LOG(INFO) << "hout:" << hout; + LOG(INFO) << "wout:" << wout; + + LOG(INFO) << "chin:" << chin; + LOG(INFO) << "hin:" << hin; + LOG(INFO) << "win:" << win; + LOG(INFO) << "pooling_type " << pooling_type; +#endif + + for (int n = 0; n < num; ++n) { + float* data_out_batch = data_out + n * chout; + const float* data_in_batch = data_in + n * chin * size_channel_in; + if (pooling_type == "max") { +#pragma omp parallel for + for (int c = 0; c < chout; ++c) { + const float* data_in_channel = data_in_batch + c * size_channel_in; + int i = 0; + float minval = std::numeric_limits::lowest(); + float32x4_t vmax = vdupq_n_f32(minval); +#ifdef __aarch64__ + for (; i < cnt; i++) { + float32x4_t vdin1 = vld1q_f32(data_in_channel); + vmax = vmaxq_f32(vdin1, vmax); + float32x4_t vdin2 = vld1q_f32(data_in_channel + 4); + vmax = vmaxq_f32(vmax, vdin2); + data_in_channel += 8; + } +#else + int num = cnt; + if (num > 0) { + asm volatile( + "max_loop: @main loop\n" + "vld1.f32 {d0-d1}, [%[data_in_channel]]! @load q1, " + "data_in_channel\n" + "vmax.f32 %q[vmax], %q[vmax], q0 @max vmax, " + "vmax, data_in_channel\n" + "vld1.f32 {d2-d3}, [%[data_in_channel]]! 
@ load 2nd 4 " + "data" + "vmax.f32 %q[vmax], %q[vmax], q1 @ compare 2nd " + "4 datas\n" + "subs %[num], #1 @subs num, 1\n" + "bne max_loop @bne num\n" + : [data_in_channel] "+r"(data_in_channel), [num] "+r"(num), + [vmax] "+w"(vmax) + : + : "cc", "memory", "q0", "q1"); + } +#endif // __aarch64__ + float32x2_t vmax_tmp = + vmax_f32(vget_low_f32(vmax), vget_high_f32(vmax)); + float tmp1 = vget_lane_f32(vmax_tmp, 0); + float tmp2 = vget_lane_f32(vmax_tmp, 1); + float max_tmp = tmp1 > tmp2 ? tmp1 : tmp2; + for (i = cnt * 8; i < size_channel_in; ++i) { + /* code */ + max_tmp = max_tmp > data_in_channel[0] ? max_tmp : data_in_channel[0]; + data_in_channel++; + } + data_out_batch[c] = max_tmp; + } + } else { +#pragma omp parallel for + for (int c = 0; c < chout; c++) { + const float* data_in_channel = + data_in_batch + c * size_channel_in; // in address + int i = 0; + float32x4_t vsum = vdupq_n_f32(0.0f); +#ifdef __aarch64__ + for (; i < cnt; i++) { // + vsum = vaddq_f32(vld1q_f32(data_in_channel), vsum); + data_in_channel += 4; + } +#else + int num = cnt; + if (num > 0) { + asm volatile( + "add_loop: @main loop\n" + "vld1.f32 {d0-d1}, [%[data_in_channel]]! @load q1, " + "data_in_channel\n" + "vadd.f32 %q[vsum], %q[vsum], q0 @add vmax, " + "vmax, data_in_channel\n" + "subs %[num], #1 @subs num, 1\n" + "bne add_loop @bne num\n" + : [data_in_channel] "+r"(data_in_channel), [num] "+r"(num), + [vsum] "+w"(vsum) + : + : "cc", "memory", "q0"); + } +#endif // __aarch64__ + float32x2_t vsum_tmp = + vadd_f32(vget_low_f32(vsum), vget_high_f32(vsum)); + float sum = vget_lane_f32(vsum_tmp, 0) + vget_lane_f32(vsum_tmp, 1); + for (i = cnt * 4; i < size_channel_in; i++) { + sum += data_in_channel[0]; + data_in_channel++; + } + data_out_batch[c] = sum / size_channel_in; + } + } + } +} + +void pooling2x2s2_max(const void* din, void* dout, int num, int chout, int hout, + int wout, int chin, int hin, int win, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, bool global_pooling, + bool exclusive, bool adaptive, bool ceil_mode, + bool use_quantizer, const std::string& pooling_type) { + int size_channel_out = wout * hout; + int size_channel_in = win * hin; + float* data_out = static_cast(dout); + const float* data_in = static_cast(din); + + int w_even = (win >> 1) << 1; + // int w_remains = w_in - w_even; // should be 0 or 1 + int h_even = (hin >> 1) << 1; + // int h_remains = h_in - h_even; // should be 0 or 1 + int w_unroll_size = (w_even >> 3) << 3; + // int w_unroll_remian = w_even - w_unroll_size; + int w_in_2 = win << 1; + float32x4_t vzero = vdupq_n_f32(0.f); + + for (int n = 0; n < num; ++n) { + float* data_out_batch = data_out + n * chout * size_channel_out; + const float* data_in_batch = data_in + n * chin * size_channel_in; +#pragma omp parallel for + for (int c = 0; c < chout; c++) { + float* data_out_channel = data_out_batch + c * size_channel_out; + const float* data_in_channel = data_in_batch + c * size_channel_in; + const float* r0 = data_in_channel; + const float* r1 = r0 + win; + int h = 0; + for (; h < h_even; h += 2) { + int w = 0; +#ifdef __aarch64__ + for (; w < w_unroll_size; w += 8) { + float32x4_t dr00 = vld1q_f32(&r0[w]); + float32x4_t dr01 = vld1q_f32(&r0[w + 4]); + float32x4_t dr10 = vld1q_f32(&r1[w]); + float32x4_t dr11 = vld1q_f32(&r1[w + 4]); + float32x4_t dmax1 = vmaxq_f32(dr00, dr10); + float32x4_t dmax2 = vmaxq_f32(dr01, dr11); +#ifdef __aarch64__ + float32x4_t dmax = vpmaxq_f32(dmax1, dmax2); +#else + float32x2_t dmaxl = + 
vpmax_f32(vget_low_f32(dmax1), vget_high_f32(dmax1)); + float32x2_t dmaxh = + vpmax_f32(vget_low_f32(dmax2), vget_high_f32(dmax2)); + float32x4_t dmax = vcombine_f32(dmaxl, dmaxh); +#endif + vst1q_f32(&data_out_channel[w >> 1], dmax); + } +#else + w = w_unroll_size; + int num = w_unroll_size >> 3; + const float* dr0 = r0; + const float* dr1 = r1; + float* dr_out = data_out_channel; + if (num > 0) { + asm volatile( + "s2_max_loop: @main loop\n" + "vld1.f32 {d0-d3}, [%[dr0]]! @load q0, dr0\n" + "vld1.f32 {d4-d7}, [%[dr1]]! @load q1, dr1\n" + "vmax.f32 q0, q0, q2 @max q0, q0, " + "q2\n" + "vmax.f32 q1, q1, q3 @max q1, q1, " + "q2\n" + "vpmax.f32 d4, d0, d1 @max d4, d0, " + "d1\n" + "vpmax.f32 d5, d2, d3 @max d5, d2, " + "d3\n" + "vst1.f32 {d4-d5}, [%[dr_out]]! @vst1 q2, " + "dr_out\n" + "subs %[num], #1 @subs num, 1\n" + "bne s2_max_loop @bne num\n" + : [dr0] "+r"(dr0), [dr1] "+r"(dr1), [dr_out] "+r"(dr_out), + [num] "+r"(num) + : + : "cc", "memory", "q0", "q1", "q2", "q3"); + } +#endif // __aarch64__ + for (; w < w_even; w += 2) { + data_out_channel[w >> 1] = + std::max(std::max(r0[w], r0[w + 1]), std::max(r1[w], r1[w + 1])); + } + for (; w < win; ++w) { // run 0 or 1 time + data_out_channel[w >> 1] = std::max(r0[w], r1[w]); + } + r0 += w_in_2; // << 1; + r1 += w_in_2; // << 1; + data_out_channel += wout; + } + // process remain row (odd, last row) + for (; h < hin; h++) { // run 0 or 1 time + int w = 0; +#ifdef __aarch64__ + for (; w < w_unroll_size; w += 8) { + float32x4_t dr00 = vld1q_f32(&r0[w]); + float32x4_t dr01 = vld1q_f32(&r0[w + 4]); +#ifdef __aarch64__ + float32x4_t dmax = vpmaxq_f32(dr00, dr01); +#else + float32x2_t dmaxl = + vpmax_f32(vget_low_f32(dr00), vget_high_f32(dr00)); + float32x2_t dmaxh = + vpmax_f32(vget_low_f32(dr01), vget_high_f32(dr01)); + float32x4_t dmax = vcombine_f32(dmaxl, dmaxh); +#endif + float32x4_t dmax_cmp_zero = vmaxq_f32(dmax, vzero); + vst1q_f32(&data_out_channel[w >> 1], dmax_cmp_zero); + } +#else + w = w_unroll_size; + int num = w_unroll_size >> 3; + const float* dr0 = r0; + float* dr_out = data_out_channel; + if (num > 0) { + asm volatile( + "s2_max_loop1: @main " + "loop\n" + "vld1.f32 {d0-d3}, [%[dr0]]! @load q0, dr0\n" + "vpmax.f32 d4, d0, d1 @max d4, d0, " + "d1\n" + "vpmax.f32 d5, d2, d3 @max d5, d2, " + "d3\n" + "vst1.f32 {d4-d5}, [%[dr_out]]! 
@vst1 q2, " + "dr_out\n" + "subs %[num], #1 @subs num, 1\n" + "bne s2_max_loop1 @bne num\n" + : [dr0] "+r"(dr0), [dr_out] "+r"(dr_out), [num] "+r"(num) + : + : "cc", "memory", "q0", "q1", "q2"); + } +#endif // __aarch64__ + for (; w < w_even; w += 2) { + data_out_channel[w >> 1] = std::max(std::max(r0[w], r0[w + 1]), 0.f); + } + for (; w < win; ++w) { // run 0 or 1 time + data_out_channel[w >> 1] = std::max(r0[w], 0.f); + } + } + } + } +} + +void pooling2x2s2_ave(const void* din, void* dout, int num, int chout, int hout, + int wout, int chin, int hin, int win, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, bool global_pooling, + bool exclusive, bool adaptive, bool ceil_mode, + bool use_quantizer, const std::string& pooling_type) { + int size_channel_out = wout * hout; + int size_channel_in = win * hin; + float* data_out = static_cast(dout); + const float* data_in = static_cast(din); + + int w_even = (win >> 1) << 1; + // int w_remains = w_in - w_even; // should be 0 or 1 + int h_even = (hin >> 1) << 1; + // int h_remains = h_in - h_even; // should be 0 or 1 + int w_unroll_size = (w_even >> 3) << 3; + // int w_unroll_remian = w_even - w_unroll_size; + int w_in_2 = win << 1; + float32x4_t vcoef = vdupq_n_f32(0.25f); // divided by 4 + + for (int n = 0; n < num; ++n) { + float* data_out_batch = data_out + n * chout * size_channel_out; + const float* data_in_batch = data_in + n * chin * size_channel_in; +#pragma omp parallel for + for (int c = 0; c < chout; c++) { + float* data_out_channel = data_out_batch + c * size_channel_out; + const float* data_in_channel = data_in_batch + c * size_channel_in; + const float* r0 = data_in_channel; + const float* r1 = r0 + win; + int h = 0; + for (; h < h_even; h += 2) { + int w = 0; +#ifdef __aarch64__ + for (; w < w_unroll_size; w += 8) { + float32x4_t dr00 = vld1q_f32(&r0[w]); + float32x4_t dr01 = vld1q_f32(&r0[w + 4]); + float32x4_t dr10 = vld1q_f32(&r1[w]); + float32x4_t dr11 = vld1q_f32(&r1[w + 4]); + float32x4_t dsum1 = vaddq_f32(dr00, dr10); + float32x4_t dsum2 = vaddq_f32(dr01, dr11); +#ifdef __aarch64__ + float32x4_t dsum = vpaddq_f32(dsum1, dsum2); +#else + float32x2_t dsuml = + vpadd_f32(vget_low_f32(dsum1), vget_high_f32(dsum1)); + float32x2_t dsumh = + vpadd_f32(vget_low_f32(dsum2), vget_high_f32(dsum2)); + float32x4_t dsum = vcombine_f32(dsuml, dsumh); +#endif + float32x4_t res = vmulq_f32(dsum, vcoef); + vst1q_f32(&data_out_channel[w >> 1], res); + } +#else + w = w_unroll_size; + int num = w_unroll_size >> 3; + const float* dr0 = r0; + const float* dr1 = r1; + float* dr_out = data_out_channel; + + if (num > 0) { + asm volatile( + "1: @ main loop\n" + "vld1.f32 {d0-d3}, [%[dr0]]! @ load q0, " + "dr0\n" + "vld1.f32 {d4-d7}, [%[dr1]]! @ load q1, " + "dr1\n" + "vadd.f32 q0, q0, q2 @ add q0, q0, " + "q2\n" + "vadd.f32 q1, q1, q3 @ add q1, q1, " + "q2\n" + "vpadd.f32 d4, d0, d1 @ add d4, d0, " + "d1\n" + "vpadd.f32 d5, d2, d3 @ add d5, d2, " + "d3\n" + "vmul.f32 q2, q2, %q[vcoef] @ mul q2, q2, " + "vcoef\n" + "vst1.f32 {d4-d5}, [%[dr_out]]! 
@ vst1 q2, " + "dr_out\n" + "subs %[num], #1 @ subs num, 1\n" + "bne 1b @ bne num\n" + : [dr0] "+r"(dr0), [dr1] "+r"(dr1), [dr_out] "+r"(dr_out), + [vcoef] "+w"(vcoef), [num] "+r"(num) + : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(num), "w"(vcoef) + : "cc", "memory", "q0", "q1", "q2", "q3"); + } +#endif // __aarch64__ + for (; w < w_even; w += 2) { + data_out_channel[w >> 1] = + (r0[w] + r0[w + 1] + r1[w] + r1[w + 1]) / 4.f; + } + for (; w < win; ++w) { // run 0 or 1 time + data_out_channel[w >> 1] = (r0[w] + r1[w]) / 4.f; + } + r0 += w_in_2; // << 1; + r1 += w_in_2; // << 1; + data_out_channel += wout; + } + // process remain row (odd, last row) + for (; h < hin; h++) { // run 0 or 1 time + int w = 0; +#ifdef __aarch64__ + for (; w < w_unroll_size; w += 8) { + float32x4_t dr00 = vld1q_f32(&r0[w]); + float32x4_t dr01 = vld1q_f32(&r0[w + 4]); +#ifdef __aarch64__ + float32x4_t dsum = vpaddq_f32(dr00, dr01); +#else + float32x2_t dsuml = + vpadd_f32(vget_low_f32(dr00), vget_high_f32(dr00)); + float32x2_t dsumh = + vpadd_f32(vget_low_f32(dr01), vget_high_f32(dr01)); + float32x4_t dsum = vcombine_f32(dsuml, dsumh); +#endif + float32x4_t res = vmulq_f32(dsum, vcoef); + vst1q_f32(&data_out_channel[w >> 1], res); + } +#else + w = w_unroll_size; + int num = w_unroll_size >> 3; + const float* dr0 = r0; + float* dr_out = data_out_channel; + + if (num > 0) { + asm volatile( + "1: @ main loop\n" + "vld1.f32 {d0-d3}, [%[dr0]]! @ load q0, " + "dr0\n" + "vpadd.f32 d4, d0, d1 @ add d4, d0, " + "d1\n" + "vpadd.f32 d5, d2, d3 @ add d5, d2, " + "d3\n" + "vmul.f32 q2, q2, %q[vcoef] @ mul q2, q2, " + "vcoef\n" + "vst1.f32 {d4-d5}, [%[dr_out]]! @ vst1 q2, " + "dr_out\n" + "subs %[num], #1 @ subs num, 1\n" + "bne 1b @ bne num\n" + : [dr0] "+r"(dr0), [dr_out] "+r"(dr_out), [vcoef] "+w"(vcoef), + [num] "+r"(num) + : "r"(dr0), "r"(dr_out), "r"(num), "w"(vcoef) + : "cc", "memory", "q0", "q1", "q2"); + } +#endif // __aarch64__ + for (; w < w_even; w += 2) { + data_out_channel[w >> 1] = (r0[w] + r0[w + 1]) / 4.f; + } + for (; w < win; ++w) { // run 0 or 1 time + data_out_channel[w >> 1] = r0[w] / 4.f; + } + } + } + } +} + +void pooling3x3s1p1_max(const void* din, void* dout, int num, int chout, + int hout, int wout, int chin, int hin, int win, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, bool global_pooling, + bool exclusive, bool adaptive, bool ceil_mode, + bool use_quantizer, const std::string& pooling_type) { + // no need to pad input tensor, pad_size is not used, default border is zero + // padded + int ch_in = chin; + int h_in = hin; + int w_in = win; + + int ch_out = chout; + int h_out = hout; + int w_out = wout; + + int size_channel_out = w_out * h_out; + int size_channel_in = win * hin; + float* data_out = static_cast(dout); + const float* data_in = static_cast(din); + + int w_even = (w_in >> 1) << 1; + // int w_remains = w_in - w_even; // should be 0 or 1 + int h_even = (h_in >> 1) << 1; + // int h_remains = h_in - h_even; // should be 0 or 1 + // int w_unroll_size = (w_even >> 3) << 3; + // int w_unroll_remian = w_even - w_unroll_size; + int w_in_2 = w_in << 1; + int w_unroll_size = (w_in - 2) >> 2; + int w_unroll_remian = w_in - 2 - w_unroll_size * 4; + float minval = std::numeric_limits::lowest(); + float32x4_t vzero = vdupq_n_f32(minval); // zero pad + + for (int n = 0; n < num; ++n) { + float* data_out_batch = data_out + n * ch_out * size_channel_out; + const float* data_in_batch = data_in + n * ch_in * size_channel_in; +#pragma omp parallel for + for (int c = 0; c < 
ch_out; c++) { + float* data_out_channel = data_out_batch + c * size_channel_out; + const float* data_in_channel = data_in_batch + c * size_channel_in; + const float* r0 = data_in_channel; + const float* r1 = r0 + w_in; + const float* r2 = r1 + w_in; + int cnt_num = w_unroll_size; // w_in / 4 + float* dr_out = data_out_channel; + const float* dr0 = r0; + const float* dr1 = r1; + const float* dr2 = r2; + int w = 0; + int cnt = 1; + // left + data_out_channel[0] = + std::max(std::max(r0[0], r0[1]), std::max(r1[0], r1[1])); +// first row with zero pad +#ifdef __aarch64__ + for (; w <= w_in - 6; w += 4) { + float32x4_t vr0_1234 = vld1q_f32(&r0[w]); + float32x4_t vr1_1234 = vld1q_f32(&r1[w]); + float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); + float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); + float32x4_t vmax_1234 = vmaxq_f32(vr0_1234, vr1_1234); + float32x4_t vmax_5678 = vmaxq_f32(vr0_5678, vr1_5678); + + float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678, 1); + float32x4_t vmax_3456 = vextq_f32(vmax_1234, vmax_5678, 2); + float32x2_t vmax_12_34 = + vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); + float32x2_t vmax_23_45 = + vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); + float32x2_t vmax_34_56 = + vpmax_f32(vget_low_f32(vmax_3456), vget_high_f32(vmax_3456)); + float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); + float32x2_t vmax_234_456 = vmax_f32(vmax_23_45, vmax_34_56); + float32x4_t vmax = vdupq_n_f32(vget_lane_f32(vmax_123_345, 0)); + vmax = vsetq_lane_f32(vget_lane_f32(vmax_234_456, 0), vmax, 1); + vmax = vsetq_lane_f32(vget_lane_f32(vmax_123_345, 1), vmax, 2); + vmax = vsetq_lane_f32(vget_lane_f32(vmax_234_456, 1), vmax, 3); + vst1q_f32(&data_out_channel[cnt], vmax); + cnt += 4; + } + +#else + dr_out = dr_out + 1; + + if (cnt_num > 0) { + asm volatile( + "1: @main loop\n" + "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d5, " + "dr0\n" + "vld1.f32 {d4-d5}, [%[dr1]]! @load d4-d7, dr1\n" + "vld1.f32 {d2}, [%[dr0]]! @load d0-d5, dr0\n" + "vld1.f32 {d6}, [%[dr1]]! @load d4-d7, dr1\n" + "vmax.f32 q5, q0, q2 @max " + "r0_1234,r1_1234\n" + "vmax.f32 d12, d2, d6 @max " + "r0_5678,r1_5678\n" + //"vmov.f32 s7,s6 @mov s7, s6\n" + "vext.f32 q0, q5, q6, #1 @vext max_2345\n" + "vext.f32 q2, q5, q6, #2 @vext max_3456\n" + "vpmax.f32 d2, d10, d11 @pmax d4, " + "max_1234, max_1234\n" + "vpmax.f32 d3, d0, d1 @pmax d4, " + "max_2345, max_2345\n" + "vpmax.f32 d6, d4, d5 @pmax d6, " + "max_3456, max_3456\n" + "vmax.f32 d8, d2, d3 @max d2, " + "vmax_12_34, vmax_23_45\n" + "vmax.f32 d9, d3, d6 @max d2, " + "vmax_23_45, vmax_34_56\n" + "sub %[dr0], #8 @sub w, 8\n" + "sub %[dr1], #8 @sub w, 8\n" + // swap + "vmov.f32 s0, s17 @mov \n" + "vmov.f32 s17, s18 @mov \n" + "vmov.f32 s18, s0 @mov \n" + "subs %[cnt_num], #1 @subs cnt_num, " + "#1\n" + "vst1.f32 d8, [%[dr_out]]! @vst1 d0, dr_out\n" + "vst1.f32 d9, [%[dr_out]]! 
@vst1 d0, dr_out\n" + "bne 1b @bne s1_max_loop\n" + : [dr0] "+r"(dr0), [dr1] "+r"(dr1), [dr_out] "+r"(dr_out), + [cnt_num] "+r"(cnt_num) + : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6"); + } + +#endif + // remian + w = w_unroll_size * 4; + for (int j = 0; j < w_unroll_remian; j++) { + float tmp_max = std::max(r0[j + w], r1[j + w]); + tmp_max = std::max(tmp_max, std::max(r0[j + w + 1], r1[j + w + 1])); + tmp_max = std::max(tmp_max, std::max(r0[j + w + 2], r1[j + w + 2])); + data_out_channel[j + w + 1] = tmp_max; + } + // right + float tmp = std::max(r0[w_in - 2], r1[w_in - 2]); + tmp = std::max(tmp, std::max(r0[w_in - 1], r1[w_in - 1])); + data_out_channel[w_out - 1] = tmp; + + // r0 = r1; + // r1 = r0 + w_in; + // r2 = r1 + w_in; + data_out_channel += w_out; + int h = 0; + for (; h < h_in - 2; h += 1) { + // deal with left pad + float maxr0 = std::max(r0[0], r0[1]); + float maxr1 = std::max(r1[0], r1[1]); + float maxr2 = std::max(r2[0], r2[1]); + data_out_channel[0] = std::max(std::max(maxr0, maxr1), maxr2); +#ifdef __aarch64__ + w = 0; + cnt = 1; + for (; w <= w_in - 6; w += 4) { + float32x4_t vr0_1234 = vld1q_f32(&r0[w]); + float32x4_t vr1_1234 = vld1q_f32(&r1[w]); + float32x4_t vr2_1234 = vld1q_f32(&r2[w]); + float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); + float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); + float32x4_t vr2_5678 = vld1q_f32(&r2[w + 4]); + float32x4_t vmax_1234 = vmaxq_f32(vr0_1234, vr1_1234); + vmax_1234 = vmaxq_f32(vmax_1234, vr2_1234); + float32x4_t vmax_5678 = vmaxq_f32(vr0_5678, vr1_5678); + vmax_5678 = vmaxq_f32(vmax_5678, vr2_5678); + + float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678, 1); + float32x4_t vmax_3456 = vextq_f32(vmax_1234, vmax_5678, 2); + float32x2_t vmax_12_34 = + vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); + float32x2_t vmax_23_45 = + vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); + float32x2_t vmax_34_56 = + vpmax_f32(vget_low_f32(vmax_3456), vget_high_f32(vmax_3456)); + float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); + float32x2_t vmax_234_456 = vmax_f32(vmax_23_45, vmax_34_56); + float32x4_t vmax = vdupq_n_f32(vget_lane_f32(vmax_123_345, 0)); + vmax = vsetq_lane_f32(vget_lane_f32(vmax_234_456, 0), vmax, 1); + vmax = vsetq_lane_f32(vget_lane_f32(vmax_123_345, 1), vmax, 2); + vmax = vsetq_lane_f32(vget_lane_f32(vmax_234_456, 1), vmax, 3); + vst1q_f32(&data_out_channel[cnt], vmax); + cnt += 4; + } +#else + dr_out = data_out_channel + 1; + dr0 = r0; + dr1 = r1; + dr2 = r2; + cnt_num = w_unroll_size; + if (cnt_num > 0) { + asm volatile( + "1: @main " + "loop\n" + "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d5, " + "dr0\n" + "vld1.f32 {d4-d5}, [%[dr1]]! @load d4-d7, " + "dr1\n" + "vld1.f32 {d8-d9}, [%[dr2]]! @load d4-d7, " + "dr1\n" + "vld1.f32 {d2}, [%[dr0]]! @load d0-d5, dr0\n" + "vld1.f32 {d6}, [%[dr1]]! @load d4-d7, dr1\n" + "vld1.f32 {d10}, [%[dr2]]! 
@load d4-d7, dr1\n" + "vmax.f32 q7, q0, q2 @max " + "r0_1234,r1_1234\n" + "vmax.f32 d16, d2, d6 @max " + "r0_5678,r1_5678\n" + "vmax.f32 q3, q7, q4 @max " + "r0_1234,r1_1234\n" + "vmax.f32 d12, d16, d10 @max " + "r0_5678,r1_5678\n" + //"vmov.f32 s7,s6 @mov s7, s6\n" + "vext.f32 q0, q3, q6, #1 @vext max_2345\n" + "vext.f32 q2, q3, q6, #2 @vext max_3456\n" + "vpmax.f32 d2, d6, d7 @pmax d4, " + "max_1234, max_1234\n" + "vpmax.f32 d3, d0, d1 @pmax d4, " + "max_2345, max_2345\n" + "vpmax.f32 d6, d4, d5 @pmax d6, " + "max_3456, max_3456\n" + "vmax.f32 d8, d2, d3 @max d2, " + "vmax_12_34, vmax_23_45\n" + "vmax.f32 d9, d3, d6 @max d2, " + "vmax_23_45, vmax_34_56\n" + "sub %[dr0], #8 @sub w, 8\n" + "sub %[dr1], #8 @sub w, 8\n" + "sub %[dr2], #8 @sub w, 8\n" + // swap + "vmov.f32 s0, s17 @mov \n" + "vmov.f32 s17, s18 @mov \n" + "vmov.f32 s18, s0 @mov \n" + "subs %[cnt_num], #1 @subs cnt_num, " + "#1\n" + "vst1.f32 d8, [%[dr_out]]! @vst1 d0, " + "dr_out\n" + "vst1.f32 d9, [%[dr_out]]! @vst1 d0, " + "dr_out\n" + "bne 1b @ bne " + "s1_max_loop\n" + : [dr0] "+r"(dr0), [dr1] "+r"(dr1), [dr2] "+r"(dr2), + [dr_out] "+r"(dr_out), [cnt_num] "+r"(cnt_num) + : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8"); + } +#endif + // remian + w = w_unroll_size * 4; + for (int j = 0; j < w_unroll_remian; j++) { + float tmp_max = std::max(r0[j + w], r1[j + w]); + tmp_max = std::max(tmp_max, std::max(r0[j + w + 1], r1[j + w + 1])); + tmp_max = std::max(tmp_max, std::max(r0[j + w + 2], r1[j + w + 2])); + tmp_max = std::max(tmp_max, std::max(r2[j + w], r2[j + w + 1])); + tmp_max = std::max(tmp_max, r2[j + w + 2]); + data_out_channel[j + w + 1] = tmp_max; + } + // right + tmp = std::max(r0[w_in - 2], r1[w_in - 2]); + tmp = std::max(tmp, std::max(r0[w_in - 1], r1[w_in - 1])); + tmp = std::max(tmp, std::max(r2[w_in - 2], r2[w_in - 1])); + data_out_channel[w_out - 1] = tmp; + + r0 = r1; + r1 = r2; + r2 = r1 + w_in; + data_out_channel += w_out; + } + + // the last two line + float maxr0 = std::max(r0[0], r0[1]); + float maxr1 = std::max(r1[0], r1[1]); + data_out_channel[0] = std::max(maxr0, maxr1); +#ifdef __aarch64__ + w = 0; + cnt = 1; + for (; w <= w_in - 6; w += 4) { + float32x4_t vr0_1234 = vld1q_f32(&r0[w]); + float32x4_t vr1_1234 = vld1q_f32(&r1[w]); + float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); + float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); + float32x4_t vmax_1234 = vmaxq_f32(vr0_1234, vr1_1234); + float32x4_t vmax_5678 = vmaxq_f32(vr0_5678, vr1_5678); + + float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678, 1); + float32x4_t vmax_3456 = vextq_f32(vmax_1234, vmax_5678, 2); + float32x2_t vmax_12_34 = + vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); + float32x2_t vmax_23_45 = + vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); + float32x2_t vmax_34_56 = + vpmax_f32(vget_low_f32(vmax_3456), vget_high_f32(vmax_3456)); + float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); + float32x2_t vmax_234_456 = vmax_f32(vmax_23_45, vmax_34_56); + float32x4_t vmax = vdupq_n_f32(vget_lane_f32(vmax_123_345, 0)); + vmax = vsetq_lane_f32(vget_lane_f32(vmax_234_456, 0), vmax, 1); + vmax = vsetq_lane_f32(vget_lane_f32(vmax_123_345, 1), vmax, 2); + vmax = vsetq_lane_f32(vget_lane_f32(vmax_234_456, 1), vmax, 3); + vst1q_f32(&data_out_channel[cnt], vmax); + cnt += 4; + } +#else + dr_out = data_out_channel + 1; + dr0 = r0; + dr1 = r1; + cnt_num = w_unroll_size; + if (cnt_num > 0) { + asm volatile( + "1: @main loop\n" + "vld1.f32 
{d0-d1}, [%[dr0]]! @load d0-d5, " + "dr0\n" + "vld1.f32 {d4-d5}, [%[dr1]]! @load d4-d7, dr1\n" + "vld1.f32 {d2}, [%[dr0]]! @load d0-d5, dr0\n" + "vld1.f32 {d6}, [%[dr1]]! @load d4-d7, dr1\n" + "vmax.f32 q5, q0, q2 @max " + "r0_1234,r1_1234\n" + "vmax.f32 d12, d2, d6 @max " + "r0_5678,r1_5678\n" + //"vmov.f32 s7,s6 @mov s7, s6\n" + "vext.f32 q0, q5, q6, #1 @vext max_2345\n" + "vext.f32 q2, q5, q6, #2 @vext max_3456\n" + "vpmax.f32 d2, d10, d11 @pmax d4, " + "max_1234, max_1234\n" + "vpmax.f32 d3, d0, d1 @pmax d4, " + "max_2345, max_2345\n" + "vpmax.f32 d6, d4, d5 @pmax d6, " + "max_3456, max_3456\n" + "vmax.f32 d8, d2, d3 @max d2, " + "vmax_12_34, vmax_23_45\n" + "vmax.f32 d9, d3, d6 @max d2, " + "vmax_23_45, vmax_34_56\n" + "sub %[dr0], #8 @sub w, 8\n" + "sub %[dr1], #8 @sub w, 8\n" + // swap + "vmov.f32 s0, s17 @mov \n" + "vmov.f32 s17, s18 @mov \n" + "vmov.f32 s18, s0 @mov \n" + "subs %[cnt_num], #1 @subs cnt_num, " + "#1\n" + "vst1.f32 d8, [%[dr_out]]! @vst1 d0, dr_out\n" + "vst1.f32 d9, [%[dr_out]]! @vst1 d0, dr_out\n" + "bne 1b @bne s1_max_loop\n" + : [dr0] "+r"(dr0), [dr1] "+r"(dr1), [dr_out] "+r"(dr_out), + [cnt_num] "+r"(cnt_num) + : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6"); + } +#endif + // remian + w = w_unroll_size * 4; + for (int j = 0; j < w_unroll_remian; j++) { + float tmp_max = std::max(r0[j + w], r1[j + w]); + tmp_max = std::max(tmp_max, std::max(r0[j + w + 1], r1[j + w + 1])); + tmp_max = std::max(tmp_max, std::max(r0[j + w + 2], r1[j + w + 2])); + data_out_channel[j + w + 1] = tmp_max; + } + tmp = std::max(r0[w_in - 2], r1[w_in - 2]); + tmp = std::max(tmp, std::max(r0[w_in - 1], r1[w_in - 1])); + data_out_channel[w_out - 1] = tmp; + } + } +} + +void pooling3x3s1p1_ave(const void* din, void* dout, int num, int chout, + int hout, int wout, int chin, int hin, int win, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, bool global_pooling, + bool exclusive, bool adaptive, bool ceil_mode, + bool use_quantizer, const std::string& pooling_type) { + int w_in = win; + int h_in = hin; + int ch_in = chin; + + int w_out = wout; + int h_out = hout; + int ch_out = chout; + + int size_channel_out = w_out * h_out; + int size_channel_in = w_in * h_in; + float* data_out = static_cast(dout); + const float* data_in = static_cast(din); + + int w_even = (w_in >> 1) << 1; + int h_even = (h_in >> 1) << 1; + int w_in_2 = w_in << 1; + int w_unroll_size = (w_in - 2) >> 2; + int w_unroll_remian = w_in - 2 - w_unroll_size * 4; + float32x4_t vzero = vdupq_n_f32(0.f); // zero pad + float32x4_t vcoef = vdupq_n_f32(1.f / 9.f); // zero pad + + for (int n = 0; n < num; ++n) { + float* data_out_batch = data_out + n * ch_out * size_channel_out; + const float* data_in_batch = data_in + n * ch_in * size_channel_in; +#pragma omp parallel for + for (int c = 0; c < ch_out; c++) { + float* data_out_channel = data_out_batch + c * size_channel_out; + const float* data_in_channel = data_in_batch + c * size_channel_in; + const float* r0 = data_in_channel; + const float* r1 = r0 + w_in; + const float* r2 = r1 + w_in; + int cnt_num = w_unroll_size; // w_in / 4 + float* dr_out = data_out_channel; + const float* dr0 = r0; + const float* dr1 = r1; + const float* dr2 = r2; + int w = 0; + int cnt = 1; + // left + data_out_channel[0] = (r0[0] + r0[1] + r1[0] + r1[1]) / 9.f; +// first row with zero pad +#ifdef __aarch64__ + for (; w <= w_in - 6; w += 4) { + float32x4_t vr0_1234 = vld1q_f32(&r0[w]); + float32x4_t vr1_1234 = 
vld1q_f32(&r1[w]); + float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); + float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); + float32x4_t vsum_1234 = vaddq_f32(vr0_1234, vr1_1234); + float32x4_t vsum_5678 = vaddq_f32(vr0_5678, vr1_5678); + + float32x4_t vsum_2345 = vextq_f32(vsum_1234, vsum_5678, 1); + float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678, 2); + float32x4_t vsum = vaddq_f32(vsum_1234, vsum_2345); + vsum = vaddq_f32(vsum, vsum_3456); + vsum = vmulq_f32(vsum, vcoef); + vst1q_f32(&data_out_channel[cnt], vsum); + cnt += 4; + } + +#else + dr_out = dr_out + 1; + + if (cnt_num > 0) { + asm volatile( + "1: @main loop\n" + "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d5, " + "dr0\n" + "vld1.f32 {d4-d5}, [%[dr1]]! @load d4-d7, dr1\n" + "vld1.f32 {d2}, [%[dr0]]! @load d0-d5, dr0\n" + "vld1.f32 {d6}, [%[dr1]]! @load d4-d7, dr1\n" + "vadd.f32 q5, q0, q2 @max " + "r0_1234,r1_1234\n" + "vadd.f32 d12, d2, d6 @max " + "r0_5678,r1_5678\n" + //"vmov.f32 s7,s6 @mov s7, s6\n" + "vext.f32 q0, q5, q6, #1 @vext max_2345\n" + "vext.f32 q2, q5, q6, #2 @vext max_3456\n" + "vadd.f32 q1, q5, q0 @add 1234 + 2345\n" + "vadd.f32 q1, q1, q2 @add + 3456\n" + "vmul.f32 q4, q1, %q[vcoef] @mul * 1/9.f \n" + "sub %[dr0], #8 @sub w, 8\n" + "sub %[dr1], #8 @sub w, 8\n" + "subs %[cnt_num], #1 @subs cnt_num, " + "#1\n" + "vst1.f32 d8, [%[dr_out]]! @vst1 d0, dr_out\n" + "vst1.f32 d9, [%[dr_out]]! @vst1 d0, dr_out\n" + "bne 1b @bne s1_max_loop\n" + : [dr0] "+r"(dr0), [dr1] "+r"(dr1), [dr_out] "+r"(dr_out), + [cnt_num] "+r"(cnt_num), [vcoef] "+w"(vcoef) + : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6"); + } + +#endif + // remian + w = w_unroll_size * 4; + for (int j = 0; j < w_unroll_remian; j++) { + float tmp_sum = r0[j + w] + r1[j + w]; + tmp_sum += (r0[j + w + 1] + r1[j + w + 1]); + tmp_sum += (r0[j + w + 2] + r1[j + w + 2]); + data_out_channel[j + w + 1] = tmp_sum / 9.f; + } + // right + float tmp = r0[w_in - 2] + r1[w_in - 2]; + tmp += (r0[w_in - 1] + r1[w_in - 1]); + data_out_channel[w_out - 1] = tmp / 9.f; + + // r0 = r1; + // r1 = r0 + w_in; + // r2 = r1 + w_in; + data_out_channel += w_out; + int h = 0; + for (; h < h_in - 2; h += 1) { + // deal with left pad + float maxr0 = r0[0] + r0[1]; + float maxr1 = r1[0] + r1[1]; + float maxr2 = r2[0] + r2[1]; + data_out_channel[0] = (maxr0 + maxr1 + maxr2) / 9.f; +#ifdef __aarch64__ + w = 0; + cnt = 1; + for (; w <= w_in - 6; w += 4) { + float32x4_t vr0_1234 = vld1q_f32(&r0[w]); + float32x4_t vr1_1234 = vld1q_f32(&r1[w]); + float32x4_t vr2_1234 = vld1q_f32(&r2[w]); + float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); + float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); + float32x4_t vr2_5678 = vld1q_f32(&r2[w + 4]); + float32x4_t vsum_1234 = vaddq_f32(vr0_1234, vr1_1234); + vsum_1234 = vaddq_f32(vsum_1234, vr2_1234); + float32x4_t vsum_5678 = vaddq_f32(vr0_5678, vr1_5678); + vsum_5678 = vaddq_f32(vsum_5678, vr2_5678); + + float32x4_t vsum_2345 = vextq_f32(vsum_1234, vsum_5678, 1); + float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678, 2); + float32x4_t vsum = vaddq_f32(vsum_1234, vsum_2345); + vsum = vaddq_f32(vsum, vsum_3456); + vsum = vmulq_f32(vsum, vcoef); + vst1q_f32(&data_out_channel[cnt], vsum); + cnt += 4; + } +#else + dr_out = data_out_channel + 1; + dr0 = r0; + dr1 = r1; + dr2 = r2; + cnt_num = w_unroll_size; + if (cnt_num > 0) { + asm volatile( + "1: @main loop\n" + "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d5, " + "dr0\n" + "vld1.f32 {d4-d5}, [%[dr1]]! @load d4-d7, " + "dr1\n" + "vld1.f32 {d8-d9}, [%[dr2]]! 
@load d4-d7, " + "dr1\n" + "vld1.f32 {d2}, [%[dr0]]! @load d0-d5, dr0\n" + "vld1.f32 {d6}, [%[dr1]]! @load d4-d7, dr1\n" + "vld1.f32 {d10}, [%[dr2]]! @load d4-d7, dr1\n" + "vadd.f32 q7, q0, q2 @max " + "r0_1234,r1_1234\n" + "vadd.f32 d16, d2, d6 @max " + "r0_5678,r1_5678\n" + "vadd.f32 q3, q7, q4 @max " + "r0_1234,r1_1234\n" + "vadd.f32 d12, d16, d10 @max " + "r0_5678,r1_5678\n" + //"vmov.f32 s7,s6 @mov s7, s6\n" + "vext.f32 q0, q3, q6, #1 @vext max_2345\n" + "vext.f32 q2, q3, q6, #2 @vext max_3456\n" + "vadd.f32 q1, q3, q0 @add 1234 + " + "2345\n" + "vadd.f32 q1, q1, q2 @add + 3456\n" + "vmul.f32 q4, q1, %q[vcoef] @mul * 1/9.f \n" + "sub %[dr0], #8 @sub w, 8\n" + "sub %[dr1], #8 @sub w, 8\n" + "sub %[dr2], #8 @sub w, 8\n" + "subs %[cnt_num], #1 @subs cnt_num, " + "#1\n" + "vst1.f32 d8, [%[dr_out]]! @vst1 d0, " + "dr_out\n" + "vst1.f32 d9, [%[dr_out]]! @vst1 d0, " + "dr_out\n" + "bne 1b @bne " + "s1_max_loop\n" + : [dr0] "+r"(dr0), [dr1] "+r"(dr1), [dr2] "+r"(dr2), + [dr_out] "+r"(dr_out), [cnt_num] "+r"(cnt_num), + [vcoef] "+w"(vcoef) + : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8"); + } +#endif + // remian + w = w_unroll_size * 4; + for (int j = 0; j < w_unroll_remian; j++) { + float tmp_sum = r0[j + w] + r1[j + w]; + tmp_sum += (r0[j + w + 1] + r1[j + w + 1]); + tmp_sum += (r0[j + w + 2] + r1[j + w + 2]); + tmp_sum += (r2[j + w + 1] + r2[j + w + 2]); + tmp_sum += r2[j + w]; + data_out_channel[j + w + 1] = tmp_sum / 9.f; + } + // right + tmp = r0[w_in - 2] + r1[w_in - 2]; + tmp += (r0[w_in - 1] + r1[w_in - 1]); + tmp += (r2[w_in - 2] + r2[w_in - 1]); + data_out_channel[w_out - 1] = tmp / 9.f; + + r0 = r1; + r1 = r2; + r2 = r1 + w_in; + data_out_channel += w_out; + } + + // the last two line + float maxr0 = (r0[0] + r0[1]); + float maxr1 = (r1[0] + r1[1]); + data_out_channel[0] = (maxr0 + maxr1) / 9.f; +#ifdef __aarch64__ + w = 0; + cnt = 1; + for (; w <= w_in - 6; w += 4) { + float32x4_t vr0_1234 = vld1q_f32(&r0[w]); + float32x4_t vr1_1234 = vld1q_f32(&r1[w]); + float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); + float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); + float32x4_t vsum_1234 = vaddq_f32(vr0_1234, vr1_1234); + float32x4_t vsum_5678 = vaddq_f32(vr0_5678, vr1_5678); + + float32x4_t vsum_2345 = vextq_f32(vsum_1234, vsum_5678, 1); + float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678, 2); + float32x4_t vsum = vaddq_f32(vsum_1234, vsum_2345); + vsum = vaddq_f32(vsum, vsum_3456); + vsum = vmulq_f32(vsum, vcoef); + vst1q_f32(&data_out_channel[cnt], vsum); + cnt += 4; + } +#else + dr_out = data_out_channel + 1; + dr0 = r0; + dr1 = r1; + cnt_num = w_unroll_size; + if (cnt_num > 0) { + asm volatile( + "1: @main loop\n" + "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d5, " + "dr0\n" + "vld1.f32 {d4-d5}, [%[dr1]]! @load d4-d7, dr1\n" + "vld1.f32 {d2}, [%[dr0]]! @load d0-d5, dr0\n" + "vld1.f32 {d6}, [%[dr1]]! @load d4-d7, dr1\n" + "vadd.f32 q5, q0, q2 @max " + "r0_1234,r1_1234\n" + "vadd.f32 d12, d2, d6 @max " + "r0_5678,r1_5678\n" + //"vmov.f32 s7,s6 @mov s7, s6\n" + "vext.f32 q0, q5, q6, #1 @vext max_2345\n" + "vext.f32 q2, q5, q6, #2 @vext max_3456\n" + "vadd.f32 q1, q5, q0 @add 1234 + 2345\n" + "vadd.f32 q1, q1, q2 @add + 3456\n" + "vmul.f32 q4, q1, %q[vcoef] @mul * 1/9.f \n" + "sub %[dr0], #8 @sub w, 8\n" + "sub %[dr1], #8 @sub w, 8\n" + "subs %[cnt_num], #1 @subs cnt_num, " + "#1\n" + "vst1.f32 d8, [%[dr_out]]! @vst1 d0, dr_out\n" + "vst1.f32 d9, [%[dr_out]]! 
@vst1 d0, dr_out\n" + "bne 1b @bne s1_max_loop\n" + : [dr0] "+r"(dr0), [dr1] "+r"(dr1), [dr_out] "+r"(dr_out), + [cnt_num] "+r"(cnt_num), [vcoef] "+w"(vcoef) + : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6"); + } +#endif + // remian + w = w_unroll_size * 4; + for (int j = 0; j < w_unroll_remian; j++) { + float tmp_sum = r0[j + w] + r1[j + w]; + tmp_sum += (r0[j + w + 1] + r1[j + w + 1]); + tmp_sum += (r0[j + w + 2] + r1[j + w + 2]); + data_out_channel[j + w + 1] = tmp_sum / 9.f; + } + // right + tmp = r0[w_in - 2] + r1[w_in - 2]; + tmp += (r0[w_in - 1] + r1[w_in - 1]); + data_out_channel[w_out - 1] = tmp / 9.f; + } + } +} + +void pooling3x3s2p1_max(const void* din, void* dout, int num, int chout, + int hout, int wout, int chin, int hin, int win, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, bool global_pooling, + bool exclusive, bool adaptive, bool ceil_mode, + bool use_quantizer, const std::string& pooling_type) { + int size_channel_out = wout * hout; + int size_channel_in = win * hin; + float* data_out = static_cast(dout); + const float* data_in = static_cast(din); + + int kernel_h = ksize[0]; + int kernel_w = ksize[1]; + int stride_h = strides[0]; + int stride_w = strides[1]; + int pad_h = paddings[0]; + int pad_w = paddings[1]; + + int pad_top = pad_h; + int pad_left = pad_w; + int w_needed = wout * 2 + 1; + int h_needed = hout * 2 + 1; + int pad_right = w_needed - win - pad_left; + int pad_bottom = h_needed - hin - pad_top; + int w_even = (win >> 1) << 1; + int h_even = (hin >> 1) << 1; + int w_in_2 = win << 1; + float minval = std::numeric_limits::lowest(); + float32x4_t vzero = vdupq_n_f32(minval); // zero pad + int cnt_col = (win - 1) / 8; + // remain + int remain = ((win - 1) % 8) / 2; + + for (int n = 0; n < num; ++n) { + float* data_out_batch = data_out + n * chout * size_channel_out; + const float* data_in_batch = data_in + n * chin * size_channel_in; +#pragma omp parallel for + for (int c = 0; c < chout; c++) { + float* data_out_channel = data_out_batch + c * size_channel_out; + const float* data_in_channel = data_in_batch + c * size_channel_in; + const float* r0 = data_in_channel; + const float* r1 = r0 + win; + const float* r2 = r1 + win; + float* dr_out = data_out_channel; + const float* dr0 = r0; + const float* dr1 = r1; + const float* dr2 = r2; + int w = 1; + int cnt = 1; + int cnt_num = cnt_col; + int cnt_num1 = remain; + data_out_channel[0] = + std::max(std::max(r0[0], r0[1]), std::max(r1[0], r1[1])); +// first row with zero pad +#ifdef __aarch64__ + for (; w < win - 8; w += 8) { + float32x4_t vr0_1234 = vld1q_f32(&r0[w]); + float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); + float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); + float32x4_t vr1_1234 = vld1q_f32(&r1[w]); + float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); + float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); + float32x4_t vmax_1234 = vmaxq_f32(vr0_1234, vr1_1234); + float32x4_t vmax_5678 = vmaxq_f32(vr0_5678, vr1_5678); + float32x4_t vmax_9101112 = vmaxq_f32(vr0_9101112, vr1_9101112); + float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678, 1); + float32x4_t vmax_6789 = vextq_f32(vmax_5678, vmax_9101112, 1); + float32x2_t vmax_12_34 = + vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); + float32x2_t vmax_23_45 = + vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); + float32x2_t vmax_56_78 = + vpmax_f32(vget_low_f32(vmax_5678), vget_high_f32(vmax_5678)); + float32x2_t vmax_67_89 = + 
vpmax_f32(vget_low_f32(vmax_6789), vget_high_f32(vmax_6789)); + float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); + float32x2_t vmax_567_789 = vmax_f32(vmax_56_78, vmax_67_89); + vst1_f32(&data_out_channel[cnt], vmax_123_345); + vst1_f32(&data_out_channel[cnt + 2], vmax_567_789); + cnt += 4; + } + for (; w < w_even - 1; w += 2) { + float32x4_t vr0 = vld1q_f32(&r0[w]); + float32x4_t vr1 = vld1q_f32(&r1[w]); + vr0 = vsetq_lane_f32(minval, vr0, 3); + vr1 = vsetq_lane_f32(minval, vr1, 3); + float32x4_t vmax1 = vmaxq_f32(vr0, vr1); + float32x2_t vmax2 = + vpmax_f32(vget_low_f32(vmax1), vget_high_f32(vmax1)); + vmax2 = vpmax_f32(vmax2, vmax2); + data_out_channel[cnt] = vget_lane_f32(vmax2, 0); + cnt++; + } +#else + dr0 = dr0 + 1; + dr1 = dr1 + 1; + dr_out = dr_out + 1; + if (cnt_num > 0 || cnt_num1 > 0) { + asm volatile( + "cmp %[cnt_num], #0 @cmp cnt_num, 0\n" + "ble 3f @ble exit\n" + "1: @main loop\n" + "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5, " + "dr0\n" + "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7, dr1\n" + "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d5, " + "dr0\n" + "vld1.f32 {d10-d11}, [%[dr1]]! @load d4-d7, " + "dr1\n" + "vmax.f32 q6, q0, q3 @max " + "r0_1234,r1_1234\n" + "vmax.f32 q7, q1, q4 @max " + "r0_5678,r1_5678\n" + "vmax.f32 q8, q2, q5 @max " + "r0_9101112,r1_9101112\n" + //"vmov.f32 s7,s6 @mov s7, s6\n" + "vext.f32 q0, q6, q7, #1 @vext max_2345\n" + "vext.f32 q1, q7, q8, #1 @vext max_6789\n" + "vpmax.f32 d4, d12, d13 @pmax d4, " + "vmax_1234, vmax_1234\n" + "vpmax.f32 d6, d14, d15 @pmax d6, " + "vmax_5678, vmax_5678\n" + "vpmax.f32 d5, d0, d1 @pmax d5, " + "vmax_2345, vmax_2345\n" + "vpmax.f32 d7, d2, d3 @pmax d7, " + "vmax_6789, vmax_6789\n" + "vmax.f32 d8, d4, d5 @max d2, " + "vmax_12_34, vmax_23_45\n" + "vmax.f32 d9, d6, d7 @max d2, " + "vmax_56_78, vmax_67_89\n" + "sub %[dr0], #16 @add w, 8\n" + "sub %[dr1], #16 @add w, 8\n" + "vst1.f32 d8, [%[dr_out]]! @vst1 d0, dr_out\n" + "vst1.f32 d9, [%[dr_out]]! @vst1 d0, dr_out\n" + "subs %[cnt_num], #1 @subs " + "cnt_num, #1\n" + "bne 1b @bne s3_max_loop\n" + "3: @loop \n" + "cmp %[cnt_num1], #0 @cmp cnt_num, " + "0\n" + "ble 4f @ble exit\n" + "2: @main loop\n" + "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1, " + "dr0\n" + "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3, " + "dr1\n" + "vmov.f32 s3,s2 @movs3, s2\n" + "vmov.f32 s7,s6 @movs7, s6\n" + "vmax.f32 q0, q0, q1 @max q0, q0, q1\n" + "vpmax.f32 d0, d0, d1 @pmax d0, d0,d1\n" + "vpmax.f32 d0, d0, d0 @pmax d0, d0, d0\n" + "vst1.f32 d0[0], [%[dr_out]]! 
@vst d0[0], " + "dr_out\n" + "sub %[dr0], #8 @add w, 6\n" + "sub %[dr1], #8 @add w, 6\n" + "subs %[cnt_num1], #1 @subs " + "cnt_num, #1\n" + "bne 2b @bne " + "s3_max_loop_1\n" + "4: @exit\n" + : [dr0] "+r"(dr0), [dr1] "+r"(dr1), [dr_out] "+r"(dr_out), + [cnt_num] "+r"(cnt_num), [cnt_num1] "+r"(cnt_num1) + : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num), "r"(cnt_num1) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9"); + } +// printf("cnt_num: %d, cnt_num1: %d \n",cnt_num, cnt_num1); +#endif + // int w = w_even - 1; + if (pad_right) { + // deal with right pad + int wstart = (w_even >> 1) * stride_w - pad_w; + int wend = std::min(std::min(wstart + kernel_w, win + pad_w), win); + float tmp = r0[wstart]; // std::numeric_limits::min(); + for (int i = wstart; i < wend; i++) { // only run 1 or 2 times + tmp = std::max(tmp, std::max(r0[i], r1[i])); + } + data_out_channel[w_even >> 1] = tmp; + // cnt ++; + } + + r0 = r1; + r1 = r0 + win; + r2 = r1 + win; + data_out_channel += wout; + int h = 2; + for (; h < h_even; h += 2) { + // deal with left pad + float maxr0 = std::max(r0[0], r0[1]); + float maxr1 = std::max(r1[0], r1[1]); + float maxr2 = std::max(r2[0], r2[1]); + data_out_channel[0] = std::max(std::max(maxr0, maxr1), maxr2); +#ifdef __aarch64__ + w = 1; + cnt = 1; + for (; w < win - 8; w += 8) { + float32x4_t vr0_1234 = vld1q_f32(&r0[w]); + float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); + float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); + float32x4_t vr1_1234 = vld1q_f32(&r1[w]); + float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); + float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); + float32x4_t vr2_1234 = vld1q_f32(&r2[w]); + float32x4_t vr2_5678 = vld1q_f32(&r2[w + 4]); + float32x4_t vr2_9101112 = vld1q_f32(&r2[w + 8]); + float32x4_t vmax_1234 = vmaxq_f32(vr0_1234, vr1_1234); + vmax_1234 = vmaxq_f32(vmax_1234, vr2_1234); + float32x4_t vmax_5678 = vmaxq_f32(vr0_5678, vr1_5678); + vmax_5678 = vmaxq_f32(vmax_5678, vr2_5678); + float32x4_t vmax_9101112 = vmaxq_f32(vr0_9101112, vr1_9101112); + vmax_9101112 = vmaxq_f32(vmax_9101112, vr2_9101112); + float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678, 1); + float32x4_t vmax_6789 = vextq_f32(vmax_5678, vmax_9101112, 1); + float32x2_t vmax_12_34 = + vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); + float32x2_t vmax_23_45 = + vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); + float32x2_t vmax_56_78 = + vpmax_f32(vget_low_f32(vmax_5678), vget_high_f32(vmax_5678)); + float32x2_t vmax_67_89 = + vpmax_f32(vget_low_f32(vmax_6789), vget_high_f32(vmax_6789)); + float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); + float32x2_t vmax_567_789 = vmax_f32(vmax_56_78, vmax_67_89); + vst1_f32(&data_out_channel[cnt], vmax_123_345); + vst1_f32(&data_out_channel[cnt + 2], vmax_567_789); + cnt += 4; + } + for (; w < w_even - 1; w += 2) { + float32x4_t vr0 = vld1q_f32(&r0[w]); + float32x4_t vr1 = vld1q_f32(&r1[w]); + float32x4_t vr2 = vld1q_f32(&r2[w]); + vr0 = vsetq_lane_f32(minval, vr0, 3); + vr1 = vsetq_lane_f32(minval, vr1, 3); + vr2 = vsetq_lane_f32(minval, vr2, 3); + float32x4_t vmax1 = vmaxq_f32(vr0, vr1); + vmax1 = vmaxq_f32(vmax1, vr2); + float32x2_t vmax2 = + vpmax_f32(vget_low_f32(vmax1), vget_high_f32(vmax1)); + float32x2_t vmax = vpmax_f32(vmax2, vmax2); + data_out_channel[cnt] = vget_lane_f32(vmax, 0); + cnt++; + } +#else + dr_out = data_out_channel + 1; + dr0 = (r0 + 1); + dr1 = (r1 + 1); + dr2 = (r2 + 1); + cnt_num = cnt_col; + cnt_num1 = remain; + if (cnt_num > 0 || cnt_num1 > 0) { + asm 
volatile( + "cmp %[cnt_num], #0 @cmp cnt_num, " + "0\n" + "ble 3f @ble exit\n" + "1: @main loop\n" + "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5, " + "dr0\n" + "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7, " + "dr1\n" + "vld1.f32 {d12-d15}, [%[dr2]]! @load d4-d7, " + "dr1\n" + "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d5, " + "dr0\n" + "vld1.f32 {d10-d11}, [%[dr1]]! @load d4-d7, " + "dr1\n" + "vld1.f32 {d16-d17}, [%[dr2]]! @load d4-d7, " + "dr1\n" + "vmax.f32 q9, q0, q3 @max q0,q0,q2\n" + "vmax.f32 q10, q1, q4 @max q1,q1,q3\n" + "vmax.f32 q11, q2, q5 @max q1,q1,q3\n" + "vmax.f32 q0, q9, q6 @max q0,q0,q2 " + "1234\n" + "vmax.f32 q3, q10, q7 @max q1,q1,q3 " + "5678\n" + "vmax.f32 q1, q11, q8 @max q1,q1,q3 " + "9101112\n" + //"vmov.f32 s7,s6 @mov s7, s6\n" + "vext.f32 q4, q0, q3, #1 @vext 2345\n" + "vext.f32 q2, q3, q1, #1 @vext 6789\n" + "vpmax.f32 d10, d0, d1 @pmax d10, " + "vmax_1234, vmax_1234\n" + "vpmax.f32 d12, d6, d7 @pmax d12, " + "vmax_5678, vmax_5678\n" + "vpmax.f32 d11, d8, d9 @pmax d11, " + "vmax_2345, vmax_2345\n" + "vpmax.f32 d13, d4, d5 @pmax d13, " + "vmax_6789, vmax_6789\n" + "vmax.f32 d0, d10, d11 @pmax d0, " + "vmax_12_34, vmax_23_45\n" + "vmax.f32 d1, d12, d13 @pmax d1, " + "vmax_56_78, vmax_67_89\n" + "sub %[dr0], #16 @add w, 8\n" + "sub %[dr1], #16 @add w, 8\n" + "sub %[dr2], #16 @add w, 8\n" + "vst1.f32 d0, [%[dr_out]]! @vst1 d0, " + "dr_out\n" + "vst1.f32 d1, [%[dr_out]]! @vst1 d0, " + "dr_out\n" + "subs %[cnt_num], #1 @subs " + "cnt_num, #1\n" + "bne 1b @bne " + "s3_max_loop_mid\n" + "3: @loop \n" + "cmp %[cnt_num1], #0 @cmp " + "cnt_num, 0\n" + "ble 4f @ble exit1\n" + "2: @mid loop\n" + "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1, " + "dr0\n" + "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3, " + "dr1\n" + "vld1.f32 {d4-d5}, [%[dr2]]! @load d2-d3, " + "dr1\n" + "vmov.f32 s3,s2 @movs3, s2\n" + "vmov.f32 s7,s6 @movs7, s6\n" + "vmov.f32 s11,s10 @movs11, s10\n" + "vmax.f32 q0, q0, q1 @max q0, q0, " + "q1\n" + "vmax.f32 q0, q0, q2 @max q0, q0, " + "q2\n" + "vpmax.f32 d0, d0, d1 @pmax d0, " + "d0,d1\n" + "vpmax.f32 d0, d0, d0 @pmax d0, d0, " + "d0\n" + "vst1.f32 d0[0], [%[dr_out]]! 
@vst d0[0], " + "dr_out\n" + "sub %[dr0], #8 @add w, 6\n" + "sub %[dr1], #8 @add w, 6\n" + "sub %[dr2], #8 @add w, 6\n" + "subs %[cnt_num1], #1 @subs cnt_num, " + "#1\n" + "bne 2b @bne " + "s3_max_loop_mid_1\n" + "4: @exit\n" + : [dr0] "+r"(dr0), [dr1] "+r"(dr1), [dr2] "+r"(dr2), + [dr_out] "+r"(dr_out), [cnt_num] "+r"(cnt_num), + [cnt_num1] "+r"(cnt_num1) + : "r"(dr0), "r"(dr1), "r"(dr2), "r"(dr_out), "r"(cnt_num), + "r"(cnt_num1) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12"); + } +#endif + if (pad_right) { + // deal with right pad + int wstart = (w_even >> 1) * stride_w - pad_w; + int wend = std::min(std::min(wstart + kernel_w, win + pad_w), win); + float tmp = r0[wstart]; // std::numeric_limits::min(); + for (int i = wstart; i < wend; i++) { + tmp = std::max(tmp, std::max(r0[i], r1[i])); + tmp = std::max(tmp, r2[i]); + } + data_out_channel[w_even >> 1] = tmp; + // cnt ++; + } + r0 = r2; + r1 = r0 + win; + r2 = r1 + win; + data_out_channel += wout; + } + + if (pad_bottom) { + // deal with bottom pad + // first row with zero pad + int hstart = (h >> 1) * stride_h - pad_h; + int hend = std::min(std::min(hstart + kernel_h, hin + pad_h), hin); + + if (hstart == hend - 1) { // only one lline + data_out_channel[0] = std::max(r0[0], r0[1]); +#ifdef __aarch64__ + w = 1; + cnt = 1; + for (; w < win - 8; w += 8) { + float32x4_t vmax_1234 = vld1q_f32(&r0[w]); + float32x4_t vmax_5678 = vld1q_f32(&r0[w + 4]); + float32x4_t vmax_9101112 = vld1q_f32(&r0[w + 8]); + float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678, 1); + float32x4_t vmax_6789 = vextq_f32(vmax_5678, vmax_9101112, 1); + float32x2_t vmax_12_34 = + vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); + float32x2_t vmax_23_45 = + vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); + float32x2_t vmax_56_78 = + vpmax_f32(vget_low_f32(vmax_5678), vget_high_f32(vmax_5678)); + float32x2_t vmax_67_89 = + vpmax_f32(vget_low_f32(vmax_6789), vget_high_f32(vmax_6789)); + float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); + float32x2_t vmax_567_789 = vmax_f32(vmax_56_78, vmax_67_89); + vst1_f32(&data_out_channel[cnt], vmax_123_345); + vst1_f32(&data_out_channel[cnt + 2], vmax_567_789); + cnt += 4; + } + for (; w < w_even - 1; w += 2) { + float32x4_t vr0 = vld1q_f32(&r0[w]); + vr0 = vsetq_lane_f32(minval, vr0, 3); + float32x2_t vmax = vpmax_f32(vget_low_f32(vr0), vget_high_f32(vr0)); + vmax = vpmax_f32(vmax, vmax); + data_out_channel[cnt] = vget_lane_f32(vmax, 0); + cnt++; + } +#else + dr_out = data_out_channel + 1; + dr0 = (r0 + 1); + cnt_num = cnt_col; + cnt_num1 = remain; + if (cnt_num > 0 || cnt_num1 > 0) { + asm volatile( + "cmp %[cnt_num], #0 @cmp cnt_num, " + "0\n" + "ble 3f @ble exit\n" + "1: @main loop\n" + "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d3, " + "dr0\n" + "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d3, " + "dr0\n" + "vext.f32 q4, q0, q1, #1 @vext q4, q0, " + "q1, 1 2345\n" + "vext.f32 q5, q1, q2, #1 @vext q5, q0, " + "q1, 1 6789\n" + "vpmax.f32 d12, d0, d1 @pmax d12, " + "vmax_1234, vmax_1234\n" + "vpmax.f32 d14, d2, d3 @pmax d14, " + "vmax_5678, vmax_5678\n" + "vpmax.f32 d13, d8, d9 @pmax d13, " + "vmax_2345, vmax_2345\n" + "vpmax.f32 d15, d10, d11 @pmax d15, " + "vmax_6789, vmax_6789\n" + "vmax.f32 d0, d12, d13 @max d0, " + "vmax_12_34,vmax_23_45\n" + "vmax.f32 d1, d14, d15 @pmax d2, " + "vmax_56_78, vmax_67_89\n" + "sub %[dr0], #16 @add w, 6\n" + "vst1.f32 d0, [%[dr_out]]! @vst1 d0, " + "dr_out\n" + "vst1.f32 d1, [%[dr_out]]! 
@vst1 d0, " + "dr_out\n" + "subs %[cnt_num], #1 @subs " + "cnt_num, #1\n" + "bne 1b @bne " + "s3_max_loop_bot\n" + "3: @loop \n" + "cmp %[cnt_num1], #0 @cmp " + "cnt_num, 0\n" + "ble 4f @ble exit\n" + "2: @bot loop\n" + "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1, " + "dr0\n" + "vmov.f32 s3,s2 @movs3, s2\n" + "vpmax.f32 d0, d0, d1 @pmax d0, " + "d0,d1\n" + "vpmax.f32 d0, d0, d0 @pmax d0, d0, " + "d0\n" + "vst1.f32 d0[0], [%[dr_out]]! @vst d0[0], " + "dr_out\n" + "sub %[dr0], #8 @add w, 2\n" + "subs %[cnt_num1], #1 @subs " + "cnt_num, #1\n" + "bne 2b @bne " + "s3_max_loop_bot_1\n" + "4: @exit\n" + : [dr0] "+r"(dr0), [dr1] "+r"(dr1), [dr_out] "+r"(dr_out), + [cnt_num] "+r"(cnt_num), [cnt_num1] "+r"(cnt_num1) + : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num), "r"(cnt_num1) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", + "q7", "q8"); + } +#endif + if (pad_right) { + // deal with right pad + int wstart = (w_even >> 1) * stride_w - pad_w; + int wend = std::min(std::min(wstart + kernel_w, win + pad_w), win); + float tmp = r0[wstart]; // std::numeric_limits::min(); + for (int i = wstart; i < wend; i++) { + tmp = std::max(tmp, r0[i]); + } + data_out_channel[w_even >> 1] = tmp; + } + } else { // two lines + data_out_channel[0] = + std::max(std::max(r0[0], r0[1]), std::max(r1[0], r1[1])); +#ifdef __aarch64__ + w = 1; + cnt = 1; + for (; w < win - 8; w += 8) { + float32x4_t vr0_1234 = vld1q_f32(&r0[w]); + float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); + float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); + float32x4_t vr1_1234 = vld1q_f32(&r1[w]); + float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); + float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); + float32x4_t vmax_1234 = vmaxq_f32(vr0_1234, vr1_1234); + float32x4_t vmax_5678 = vmaxq_f32(vr0_5678, vr1_5678); + float32x4_t vmax_9101112 = vmaxq_f32(vr0_9101112, vr1_9101112); + float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678, 1); + float32x4_t vmax_6789 = vextq_f32(vmax_5678, vmax_9101112, 1); + float32x2_t vmax_12_34 = + vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); + float32x2_t vmax_23_45 = + vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); + float32x2_t vmax_56_78 = + vpmax_f32(vget_low_f32(vmax_5678), vget_high_f32(vmax_5678)); + float32x2_t vmax_67_89 = + vpmax_f32(vget_low_f32(vmax_6789), vget_high_f32(vmax_6789)); + float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); + float32x2_t vmax_567_789 = vmax_f32(vmax_56_78, vmax_67_89); + vst1_f32(&data_out_channel[cnt], vmax_123_345); + vst1_f32(&data_out_channel[cnt + 2], vmax_567_789); + cnt += 4; + } + for (; w < w_even - 1; w += 2) { + float32x4_t vr0 = vld1q_f32(&r0[w]); + float32x4_t vr1 = vld1q_f32(&r1[w]); + vr0 = vsetq_lane_f32(minval, vr0, 3); + vr1 = vsetq_lane_f32(minval, vr1, 3); + float32x4_t vmax1 = vmaxq_f32(vr0, vr1); + float32x2_t vmax2 = + vpmax_f32(vget_low_f32(vmax1), vget_high_f32(vmax1)); + vmax2 = vpmax_f32(vmax2, vmax2); + data_out_channel[cnt] = vget_lane_f32(vmax2, 0); + cnt++; + } +#else + dr_out = data_out_channel + 1; + dr0 = (r0 + 1); + dr1 = (r1 + 1); + cnt_num = cnt_col; + cnt_num1 = remain; + if (cnt_num > 0 || cnt_num1 > 0) { + asm volatile( + "cmp %[cnt_num], #0 @cmp cnt_num, " + "0\n" + "ble 3f @ble exit\n" + "1: @main loop\n" + "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5, " + "dr0\n" + "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7, " + "dr1\n" + "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d3, " + "dr0\n" + "vld1.f32 {d10-d11}, [%[dr1]]! 
@load d4-d7, " + "dr1\n" + "vmax.f32 q6, q0, q3 @max q0,q0,q2 " + "1234\n" + "vmax.f32 q7, q1, q4 @max q1,q1,q3 " + "5678\n" + "vmax.f32 q8, q2, q5 @max q1,q1,q3 " + "9101112\n" + //"vmov.f32 s7,s6 @mov s7, + // s6\n" + "vext.f32 q0, q6, q7, #1 @vext q0, " + "2345\n" + "vext.f32 q1, q7, q8, #1 @vext q1, " + "6789\n" + "vpmax.f32 d4, d12, d13 @pmax d4, " + "vmax_1234, vmax_1234\n" + "vpmax.f32 d6, d14, d15 @pmax d6, " + "vmax_5678, vmax_5678\n" + "vpmax.f32 d5, d0, d1 @pmax d5, " + "vmax_2345, vmax_2345\n" + "vpmax.f32 d7, d2, d3 @pmax d7, " + "vmax_6789, vmax_6789\n" + "vmax.f32 d8, d4, d5 @max d2, " + "vmax_12_34, vmax_23_45\n" + "vmax.f32 d9, d6, d7 @max d2, " + "vmax_56_78, vmax_67_89\n" + "sub %[dr0], #16 @add w, 8\n" + "sub %[dr1], #16 @add w, 8\n" + "vst1.f32 d8, [%[dr_out]]! @vst1 d0, " + "dr_out\n" + "vst1.f32 d9, [%[dr_out]]! @vst1 d0, " + "dr_out\n" + "subs %[cnt_num], #1 @subs " + "cnt_num, #1\n" + "bne 1b @bne " + "s3_max_loop_bot\n" + "3: @loop \n" + "cmp %[cnt_num1], #0 @cmp " + "cnt_num, 0\n" + "ble 4f @ble exit\n" + "2: @bot loop\n" + "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1, " + "dr0\n" + "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3, " + "dr1\n" + "vmov.f32 s3,s2 @movs3, s2\n" + "vmov.f32 s7,s6 @movs7, s6\n" + "vmax.f32 q0, q0, q1 @max q0, q0, " + "q1\n" + "vpmax.f32 d0, d0, d1 @pmax d0, " + "d0,d1\n" + "vpmax.f32 d0, d0, d0 @pmax d0, d0, " + "d0\n" + "vst1.f32 d0[0], [%[dr_out]]! @vst d0[0], " + "dr_out\n" + "sub %[dr0], #8 @add w, 6\n" + "sub %[dr1], #8 @add w, 6\n" + "subs %[cnt_num1], #1 @subs " + "cnt_num, #1\n" + "bne 2b @bne " + "s3_max_loop_bot_1\n" + "4: @exit\n" + : [dr0] "+r"(dr0), [dr1] "+r"(dr1), [dr_out] "+r"(dr_out), + [cnt_num] "+r"(cnt_num), [cnt_num1] "+r"(cnt_num1) + : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num), "r"(cnt_num1) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", + "q7", "q8", "q9"); + } +#endif + if (pad_right) { + // deal with right pad + int wstart = (w_even >> 1) * stride_w - pad_w; + int wend = std::min(std::min(wstart + kernel_w, win + pad_w), win); + float tmp = r0[wstart]; // std::numeric_limits::min(); + for (int i = wstart; i < wend; i++) { // only run 1 or 2 times + tmp = std::max(tmp, std::max(r0[i], r1[i])); + } + data_out_channel[w_even >> 1] = tmp; + } + } + } + } + } +} + +void pooling3x3s2p1_ave(const void* din, void* dout, int num, int chout, + int hout, int wout, int chin, int hin, int win, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, bool global_pooling, + bool exclusive, bool adaptive, bool ceil_mode, + bool use_quantizer, const std::string& pooling_type) { + int size_channel_out = wout * hout; + int size_channel_in = win * hin; + float* data_out = static_cast(dout); + const float* data_in = static_cast(din); + + int kernel_h = ksize[0]; + int kernel_w = ksize[1]; + int stride_h = strides[0]; + int stride_w = strides[1]; + int pad_h = paddings[0]; + int pad_w = paddings[1]; + + int pad_top = pad_h; + int pad_left = pad_w; + int w_needed = wout * 2 + 1; + int h_needed = hout * 2 + 1; + int pad_right = w_needed - win - pad_left; + int pad_bottom = h_needed - hin - pad_top; + int w_even = (win >> 1) << 1; + int h_even = (hin >> 1) << 1; + int w_in_2 = win << 1; + int w_unroll_size = (win - 1) / 8; + // remain + int w_unroll_remian = ((win - 1) % 8) / 2; + + for (int n = 0; n < num; ++n) { + float* data_out_batch = data_out + n * chout * size_channel_out; + const float* data_in_batch = data_in + n * chin * size_channel_in; +#pragma omp parallel for + for (int c = 0; c < 
chout; c++) { + float* data_out_channel = data_out_batch + c * size_channel_out; + const float* data_in_channel = data_in_batch + c * size_channel_in; + const float* r0 = data_in_channel; + const float* r1 = r0 + win; + const float* r2 = r1 + win; + int cnt_num = w_unroll_size; + int cnt_num1 = w_unroll_remian; + float* dr_out = data_out_channel; + const float* dr0 = r0; + const float* dr1 = r1; + const float* dr2 = r2; + int w = 1; + int cnt = 1; + float32x4_t vcoef = vdupq_n_f32(1.f / 9.f); + float32x4_t vzero = vdupq_n_f32(0.f); + data_out_channel[0] = (r0[0] + r0[1] + r1[0] + r1[1]) / 9.f; +// first row with zero pad +#ifdef __aarch64__ + for (; w < win - 8; w += 8) { + float32x4_t vr0_1234 = vld1q_f32(&r0[w]); + float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); + float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); + float32x4_t vr1_1234 = vld1q_f32(&r1[w]); + float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); + float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); + float32x4_t vsum_1234 = vaddq_f32(vr0_1234, vr1_1234); + float32x4_t vsum_5678 = vaddq_f32(vr0_5678, vr1_5678); + float32x4_t vsum_9101112 = vaddq_f32(vr0_9101112, vr1_9101112); + + float32x4_t vsum_2345 = vextq_f32(vsum_1234, vsum_5678, 1); + float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678, 2); + float32x4_t vsum_4567 = vextq_f32(vsum_1234, vsum_5678, 3); + float32x4_t vsum_6789 = vextq_f32(vsum_5678, vsum_9101112, 1); + float32x4_t vsum_123_345 = vaddq_f32(vsum_1234, vsum_2345); + vsum_123_345 = vaddq_f32(vsum_123_345, vsum_3456); + float32x4_t vsum_567_789 = vaddq_f32(vsum_4567, vsum_5678); + vsum_567_789 = vaddq_f32(vsum_567_789, vsum_6789); + vsum_123_345 = + vsetq_lane_f32(vgetq_lane_f32(vsum_123_345, 2), vsum_123_345, 1); + vsum_123_345 = + vsetq_lane_f32(vgetq_lane_f32(vsum_567_789, 1), vsum_123_345, 2); + vsum_123_345 = + vsetq_lane_f32(vgetq_lane_f32(vsum_567_789, 3), vsum_123_345, 3); + float32x4_t vrst = vmulq_f32(vsum_123_345, vcoef); + vst1q_f32(&data_out_channel[cnt], vrst); + cnt += 4; + } + for (; w < w_even - 1; w += 2) { + float32x4_t vr0 = vld1q_f32(&r0[w]); + float32x4_t vr1 = vld1q_f32(&r1[w]); + vr0 = vsetq_lane_f32(0.f, vr0, 3); + vr1 = vsetq_lane_f32(0.f, vr1, 3); + float32x4_t vsum1 = vaddq_f32(vr0, vr1); + float32x2_t vsum2 = + vpadd_f32(vget_low_f32(vsum1), vget_high_f32(vsum1)); + vsum2 = vpadd_f32(vsum2, vsum2); + float32x2_t vrst = vmul_f32(vsum2, vget_low_f32(vcoef)); + data_out_channel[cnt] = vget_lane_f32(vrst, 0); + cnt++; + } +#else + dr0 = dr0 + 1; + dr1 = dr1 + 1; + dr_out = dr_out + 1; + // printf("cnt_num: %d, cnt_num1: %d \n",cnt_num, cnt_num1); + if (cnt_num > 0 || cnt_num1 > 0) { + asm volatile( + "cmp %[cnt_num], #0 @cmp cnt_num, 0\n" + "ble 3f @ble exit\n" + "1: @main loop\n" + "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5, " + "dr0\n" + "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7, dr1\n" + "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d5, " + "dr0\n" + "vld1.f32 {d10-d11}, [%[dr1]]! 
@load d4-d7, " + "dr1\n" + "vadd.f32 q6, q0, q3 @max " + "r0_1234,r1_1234\n" + "vadd.f32 q7, q1, q4 @max " + "r0_5678,r1_5678\n" + "vadd.f32 q8, q2, q5 @max " + "r0_9101112,r1_9101112\n" + //"vmov.f32 s7,s6 @mov s7, s6\n" + "vext.f32 q0, q6, q7, #1 @vext max_2345\n" + "vext.f32 q1, q6, q7, #3 @vext max_4567\n" + "vext.f32 q2, q6, q7, #2 @vext max_3456\n" + "vext.f32 q3, q7, q8, #1 @vext max_6789\n" + "vadd.f32 q4, q6, q0 @add 1234, 2345 \n" + "vadd.f32 q5, q7, q1 @add 5678, 4567 \n" + "vadd.f32 q4, q4, q2 @add 3456, sum1 \n" + "vadd.f32 q5, q5, q3 @add 6789, sum2 \n" + "vmov.f32 s17, s18 @mov \n" + "vmov.f32 s18, s21 @mov \n" + "vmov.f32 s19, s23 @mov \n" + "vmul.f32 q4, q4, %q[vcoef] @mul \n" + "sub %[dr0], #16 @add w, 8\n" + "sub %[dr1], #16 @add w, 8\n" + "subs %[cnt_num], #1 @subs cnt_num, " + "#1\n" + "vst1.f32 d8, [%[dr_out]]! @vst1 d0, dr_out\n" + "vst1.f32 d9, [%[dr_out]]! @vst1 d0, dr_out\n" + "bne 1b @bne s3_max_loop\n" + "3: @loop \n" + "cmp %[cnt_num1], #0 @cmp cnt_num, " + "0\n" + "ble 4f @ble exit\n" + "2: @main loop\n" + "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1, " + "dr0\n" + "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3, " + "dr1\n" + "vext.f32 q0, %q[vzero], q0, #3 @ ext v0_0123\n" + "vext.f32 q1, %q[vzero], q1, #3 @ ext v1_0123\n" + "vadd.f32 q0, q0, q1 @add q0, q0, q1\n" + "vpadd.f32 d0, d0, d1 @padd d0, d0,d1\n" + "vpadd.f32 d0, d0, d0 @padd d0, d0, d0\n" + "vmul.f32 d0, d0, %e[vcoef] @mul \n" + "sub %[dr0], #8 @add w, 6\n" + "sub %[dr1], #8 @add w, 6\n" + "subs %[cnt_num1], #1 @subs cnt_num, " + "#1\n" + "vst1.f32 d0[0], [%[dr_out]]! @vst d0[0], " + "dr_out\n" + "bne 2b @bne s3_max_loop_1\n" + "4: @exit\n" + : [dr0] "+r"(dr0), [dr1] "+r"(dr1), [dr_out] "+r"(dr_out), + [cnt_num] "+r"(cnt_num), [cnt_num1] "+r"(cnt_num1), + [vcoef] "+w"(vcoef), [vzero] "+w"(vzero) + : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num), "r"(cnt_num1) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9"); + } +// printf("cnt_num: %d, cnt_num1: %d \n",cnt_num, cnt_num1); +#endif + // int w = w_even - 1; + if (pad_right) { + // deal with right pad + int wstart = (w_even >> 1) * stride_w - pad_w; + int wend = std::min(std::min(wstart + kernel_w, win + pad_w), win); + float tmp = 0.f; // std::numeric_limits::min(); + for (int i = wstart; i < wend; i++) { // only run 1 or 2 times + tmp += (r0[i] + r1[i]); + } + data_out_channel[w_even >> 1] = tmp / 9.f; + // cnt ++; + } + + r0 = r1; + r1 = r0 + win; + r2 = r1 + win; + data_out_channel += wout; + int h = 2; + for (; h < h_even; h += 2) { + // deal with left pad + float sum0 = r0[0] + r0[1]; + float sum1 = r1[0] + r1[1]; + float sum2 = r2[0] + r2[1]; + data_out_channel[0] = (sum0 + sum1 + sum2) / 9.f; +#ifdef __aarch64__ + w = 1; + cnt = 1; + for (; w < win - 8; w += 8) { + float32x4_t vr0_1234 = vld1q_f32(&r0[w]); + float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); + float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); + float32x4_t vr1_1234 = vld1q_f32(&r1[w]); + float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); + float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); + float32x4_t vr2_1234 = vld1q_f32(&r2[w]); + float32x4_t vr2_5678 = vld1q_f32(&r2[w + 4]); + float32x4_t vr2_9101112 = vld1q_f32(&r2[w + 8]); + float32x4_t vsum_1234 = vaddq_f32(vr0_1234, vr1_1234); + float32x4_t vsum_5678 = vaddq_f32(vr0_5678, vr1_5678); + float32x4_t vsum_9101112 = vaddq_f32(vr0_9101112, vr1_9101112); + vsum_1234 = vaddq_f32(vsum_1234, vr2_1234); + vsum_5678 = vaddq_f32(vsum_5678, vr2_5678); + vsum_9101112 = vaddq_f32(vsum_9101112, vr2_9101112); + + float32x4_t 
vsum_2345 = vextq_f32(vsum_1234, vsum_5678, 1); + float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678, 2); + float32x4_t vsum_4567 = vextq_f32(vsum_1234, vsum_5678, 3); + float32x4_t vsum_6789 = vextq_f32(vsum_5678, vsum_9101112, 1); + float32x4_t vsum_123_345 = vaddq_f32(vsum_1234, vsum_2345); + vsum_123_345 = vaddq_f32(vsum_123_345, vsum_3456); + float32x4_t vsum_567_789 = vaddq_f32(vsum_4567, vsum_5678); + vsum_567_789 = vaddq_f32(vsum_567_789, vsum_6789); + vsum_123_345 = + vsetq_lane_f32(vgetq_lane_f32(vsum_123_345, 2), vsum_123_345, 1); + vsum_123_345 = + vsetq_lane_f32(vgetq_lane_f32(vsum_567_789, 1), vsum_123_345, 2); + vsum_123_345 = + vsetq_lane_f32(vgetq_lane_f32(vsum_567_789, 3), vsum_123_345, 3); + float32x4_t vrst = vmulq_f32(vsum_123_345, vcoef); + vst1q_f32(&data_out_channel[cnt], vrst); + cnt += 4; + } + for (; w < w_even - 1; w += 2) { + float32x4_t vr0 = vld1q_f32(&r0[w]); + float32x4_t vr1 = vld1q_f32(&r1[w]); + float32x4_t vr2 = vld1q_f32(&r2[w]); + vr0 = vsetq_lane_f32(0.f, vr0, 3); + vr1 = vsetq_lane_f32(0.f, vr1, 3); + vr2 = vsetq_lane_f32(0.f, vr2, 3); + float32x4_t vsum1 = vaddq_f32(vr0, vr1); + vsum1 = vaddq_f32(vsum1, vr2); + float32x2_t vsum2 = + vpadd_f32(vget_low_f32(vsum1), vget_high_f32(vsum1)); + float32x2_t vsum = vpadd_f32(vsum2, vsum2); + data_out_channel[cnt] = vget_lane_f32(vsum, 0) / 9.f; + cnt++; + } +#else + dr_out = data_out_channel + 1; + dr0 = (r0 + 1); + dr1 = (r1 + 1); + dr2 = (r2 + 1); + cnt_num = w_unroll_size; + cnt_num1 = w_unroll_remian; + if (cnt_num > 0 || cnt_num1 > 0) { + asm volatile( + "cmp %[cnt_num], #0 @cmp cnt_num, " + "0\n" + "ble 3f @ble exit\n" + "1: @main loop\n" + "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5, " + "dr0\n" + "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7, " + "dr1\n" + "vld1.f32 {d12-d15}, [%[dr2]]! @load d4-d7, " + "dr1\n" + "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d5, " + "dr0\n" + "vld1.f32 {d10-d11}, [%[dr1]]! @load d4-d7, " + "dr1\n" + "vld1.f32 {d16-d17}, [%[dr2]]! @load d4-d7, " + "dr1\n" + "vadd.f32 q9, q0, q3 @max q0,q0,q2\n" + "vadd.f32 q10, q1, q4 @max q1,q1,q3\n" + "vadd.f32 q11, q2, q5 @max q1,q1,q3\n" + "vadd.f32 q6, q9, q6 @max q0,q0,q2 " + "1234\n" + "vadd.f32 q7, q10, q7 @max q1,q1,q3 " + "5678\n" + "vadd.f32 q8, q11, q8 @max q1,q1,q3 " + "9101112\n" + //"vmov.f32 s7,s6 @mov s7, s6\n" + "vext.f32 q0, q6, q7, #1 @vext max_2345\n" + "vext.f32 q1, q6, q7, #3 @vext max_4567\n" + "vext.f32 q2, q6, q7, #2 @vext max_3456\n" + "vext.f32 q3, q7, q8, #1 @vext max_6789\n" + "vadd.f32 q4, q6, q0 @add 1234, 2345 " + "\n" + "vadd.f32 q5, q7, q1 @add 5678, 4567 " + "\n" + "vadd.f32 q4, q4, q2 @add 3456, sum1 " + "\n" + "vadd.f32 q5, q5, q3 @add 6789, sum2 " + "\n" + "vmov.f32 s17, s18 @mov \n" + "vmov.f32 s18, s21 @mov \n" + "vmov.f32 s19, s23 @mov \n" + "vmul.f32 q4, q4, %q[vcoef] @mul \n" + "sub %[dr0], #16 @add w, 8\n" + "sub %[dr1], #16 @add w, 8\n" + "sub %[dr2], #16 @add w, 8\n" + "subs %[cnt_num], #1 @subs " + "cnt_num, #1\n" + "vst1.f32 d8, [%[dr_out]]! @vst1 d0, " + "dr_out\n" + "vst1.f32 d9, [%[dr_out]]! @vst1 d0, " + "dr_out\n" + "bne 1b @bne s3_max_loop_mid\n" + "3: @loop \n" + "cmp %[cnt_num1], #0 @cmp " + "cnt_num, 0\n" + "ble 4f @ble exit1\n" + "2: @mid loop\n" + "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1, " + "dr0\n" + "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3, " + "dr1\n" + "vld1.f32 {d4-d5}, [%[dr2]]! 
@load d2-d3, " + "dr1\n" + "vext.f32 q0, %q[vzero], q0, #3 @ ext v0_0123\n" + "vext.f32 q1, %q[vzero], q1, #3 @ ext v1_0123\n" + "vext.f32 q2, %q[vzero], q2, #3 @ ext v1_0123\n" + "vadd.f32 q0, q0, q1 @add q0, q0, " + "q1\n" + "vadd.f32 q0, q0, q2 @add q0, q0, " + "q1\n" + "vpadd.f32 d0, d0, d1 @padd d0, " + "d0,d1\n" + "vpadd.f32 d0, d0, d0 @padd d0, d0, " + "d0\n" + "vmul.f32 d0, d0, %e[vcoef] @mul \n" + "sub %[dr0], #8 @add w, 6\n" + "sub %[dr1], #8 @add w, 6\n" + "sub %[dr2], #8 @add w, 6\n" + "subs %[cnt_num1], #1 @subs cnt_num, " + "#1\n" + "vst1.f32 d0[0], [%[dr_out]]! @vst d0[0], " + "dr_out\n" + "bne 2b @bne s3_max_loop_mid_1\n" + "4: @exit\n" + : [dr0] "+r"(dr0), [dr1] "+r"(dr1), [dr2] "+r"(dr2), + [dr_out] "+r"(dr_out), [cnt_num] "+r"(cnt_num), + [cnt_num1] "+r"(cnt_num1), [vcoef] "+w"(vcoef), + [vzero] "+w"(vzero) + : "r"(dr0), "r"(dr1), "r"(dr2), "r"(dr_out), "r"(cnt_num), + "r"(cnt_num1) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12"); + } +#endif + if (pad_right) { + // deal with right pad + int wstart = (w_even >> 1) * stride_w - pad_w; + int wend = std::min(std::min(wstart + kernel_w, win + pad_w), win); + float tmp = 0.f; + for (int i = wstart; i < wend; i++) { + tmp += (r0[i] + r1[i] + r2[i]); + } + data_out_channel[w_even >> 1] = tmp / 9.f; + // cnt ++; + } + r0 = r2; + r1 = r0 + win; + r2 = r1 + win; + data_out_channel += wout; + } + + if (pad_bottom) { + // deal with bottom pad + // first row with zero pad + int hstart = (h >> 1) * stride_h - pad_h; + int hend = std::min(std::min(hstart + kernel_h, hin + pad_h), hin); + + if (hstart == hend - 1) { // only one lline + data_out_channel[0] = (r0[0] + r0[1]) / 9.f; +#ifdef __aarch64__ + w = 1; + cnt = 1; + for (; w < win - 8; w += 8) { + float32x4_t vsum_1234 = vld1q_f32(&r0[w]); + float32x4_t vsum_5678 = vld1q_f32(&r0[w + 4]); + float32x4_t vsum_9101112 = vld1q_f32(&r0[w + 8]); + + float32x4_t vsum_2345 = vextq_f32(vsum_1234, vsum_5678, 1); + float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678, 2); + float32x4_t vsum_4567 = vextq_f32(vsum_1234, vsum_5678, 3); + float32x4_t vsum_6789 = vextq_f32(vsum_5678, vsum_9101112, 1); + float32x4_t vsum_123_345 = vaddq_f32(vsum_1234, vsum_2345); + vsum_123_345 = vaddq_f32(vsum_123_345, vsum_3456); + float32x4_t vsum_567_789 = vaddq_f32(vsum_4567, vsum_5678); + vsum_567_789 = vaddq_f32(vsum_567_789, vsum_6789); + vsum_123_345 = vsetq_lane_f32(vgetq_lane_f32(vsum_123_345, 2), + vsum_123_345, 1); + vsum_123_345 = vsetq_lane_f32(vgetq_lane_f32(vsum_567_789, 1), + vsum_123_345, 2); + vsum_123_345 = vsetq_lane_f32(vgetq_lane_f32(vsum_567_789, 3), + vsum_123_345, 3); + float32x4_t vrst = vmulq_f32(vsum_123_345, vcoef); + vst1q_f32(&data_out_channel[cnt], vrst); + cnt += 4; + } + for (; w < w_even - 1; w += 2) { + float32x4_t vr0 = vld1q_f32(&r0[w]); + vr0 = vsetq_lane_f32(0.f, vr0, 3); + float32x2_t vsum = vpadd_f32(vget_low_f32(vr0), vget_high_f32(vr0)); + vsum = vpadd_f32(vsum, vsum); + data_out_channel[cnt] = vget_lane_f32(vsum, 0) / 9.f; + cnt++; + } +#else + dr_out = data_out_channel + 1; + dr0 = (r0 + 1); + cnt_num = w_unroll_size; + cnt_num1 = w_unroll_remian; + if (cnt_num > 0 || cnt_num1 > 0) { + asm volatile( + "cmp %[cnt_num], #0 @cmp cnt_num, " + "0\n" + "ble 3f @ble exit\n" + "1: @main loop\n" + "vld1.f32 {d12-d15}, [%[dr0]]! @load " + "d0-d3, dr0\n" + "vld1.f32 {d16-d17}, [%[dr0]]! 
@load " + "d0-d3, dr0\n" + "vext.f32 q0, q6, q7, #1 @vext " + "max_2345\n" + "vext.f32 q1, q6, q7, #3 @vext " + "max_4567\n" + "vext.f32 q2, q6, q7, #2 @vext " + "max_3456\n" + "vext.f32 q3, q7, q8, #1 @vext " + "max_6789\n" + "vadd.f32 q4, q6, q0 @add 1234, " + "2345 \n" + "vadd.f32 q5, q7, q1 @add 5678, " + "4567 \n" + "vadd.f32 q4, q4, q2 @add 3456, " + "sum1 \n" + "vadd.f32 q5, q5, q3 @add 6789, " + "sum2 \n" + "vmov.f32 s17, s18 @mov \n" + "vmov.f32 s18, s21 @mov \n" + "vmov.f32 s19, s23 @mov \n" + "vmul.f32 q4, q4, %q[vcoef] @mul \n" + "sub %[dr0], #16 @add w, 6\n" + "subs %[cnt_num], #1 @subs " + "cnt_num, #1\n" + "vst1.f32 d8, [%[dr_out]]! @vst1 d0, " + "dr_out\n" + "vst1.f32 d9, [%[dr_out]]! @vst1 d0, " + "dr_out\n" + "bne 1b @bne s3_max_loop_bot\n" + "3: @loop \n" + "cmp %[cnt_num1], #0 @cmp " + "cnt_num, 0\n" + "ble 4f @ble exit\n" + "2: @bot loop\n" + "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1, " + "dr0\n" + "vext.f32 q0, %q[vzero], q0, #3 @ ext " + "v0_0123\n" + "vpadd.f32 d0, d0, d1 @padd d0, " + "d0,d1\n" + "vpadd.f32 d0, d0, d0 @padd d0, d0, " + "d0\n" + "vmul.f32 d0, d0, %e[vcoef] @mul \n" + "sub %[dr0], #8 @add w, 2\n" + "subs %[cnt_num1], #1 @subs " + "cnt_num, #1\n" + "vst1.f32 d0[0], [%[dr_out]]! @vst d0[0], " + "dr_out\n" + "bne 2b @bne s3_max_loop_bot_1\n" + "4: @exit\n" + : [dr0] "+r"(dr0), [dr1] "+r"(dr1), [dr_out] "+r"(dr_out), + [cnt_num] "+r"(cnt_num), [cnt_num1] "+r"(cnt_num1), + [vcoef] "+w"(vcoef), [vzero] "+w"(vzero) + : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num), "r"(cnt_num1) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", + "q7", "q8"); + } +#endif + if (pad_right) { + // deal with right pad + int wstart = (w_even >> 1) * stride_w - pad_w; + int wend = std::min(std::min(wstart + kernel_w, win + pad_w), win); + float tmp = 0.f; + for (int i = wstart; i < wend; i++) { + tmp += r0[i]; + } + data_out_channel[w_even >> 1] = tmp / 9.f; + } + } else { // two lines + data_out_channel[0] = (r0[0] + r0[1] + r1[0] + r1[1]) / 9.f; +#ifdef __aarch64__ + w = 1; + cnt = 1; + for (; w < win - 8; w += 8) { + float32x4_t vr0_1234 = vld1q_f32(&r0[w]); + float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); + float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); + float32x4_t vr1_1234 = vld1q_f32(&r1[w]); + float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); + float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); + + float32x4_t vsum_1234 = vaddq_f32(vr0_1234, vr1_1234); + float32x4_t vsum_5678 = vaddq_f32(vr0_5678, vr1_5678); + float32x4_t vsum_9101112 = vaddq_f32(vr0_9101112, vr1_9101112); + float32x4_t vsum_2345 = vextq_f32(vsum_1234, vsum_5678, 1); + float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678, 2); + float32x4_t vsum_4567 = vextq_f32(vsum_1234, vsum_5678, 3); + float32x4_t vsum_6789 = vextq_f32(vsum_5678, vsum_9101112, 1); + float32x4_t vsum_123_345 = vaddq_f32(vsum_1234, vsum_2345); + vsum_123_345 = vaddq_f32(vsum_123_345, vsum_3456); + float32x4_t vsum_567_789 = vaddq_f32(vsum_4567, vsum_5678); + vsum_567_789 = vaddq_f32(vsum_567_789, vsum_6789); + vsum_123_345 = vsetq_lane_f32(vgetq_lane_f32(vsum_123_345, 2), + vsum_123_345, 1); + vsum_123_345 = vsetq_lane_f32(vgetq_lane_f32(vsum_567_789, 1), + vsum_123_345, 2); + vsum_123_345 = vsetq_lane_f32(vgetq_lane_f32(vsum_567_789, 3), + vsum_123_345, 3); + float32x4_t vrst = vmulq_f32(vsum_123_345, vcoef); + vst1q_f32(&data_out_channel[cnt], vrst); + cnt += 4; + } + for (; w < w_even - 1; w += 2) { + float32x4_t vr0 = vld1q_f32(&r0[w]); + float32x4_t vr1 = vld1q_f32(&r1[w]); + vr0 = vsetq_lane_f32(0.f, vr0, 3); + vr1 = 
vsetq_lane_f32(0.f, vr1, 3); + float32x4_t vsum1 = vaddq_f32(vr0, vr1); + float32x2_t vsum2 = + vpadd_f32(vget_low_f32(vsum1), vget_high_f32(vsum1)); + vsum2 = vpadd_f32(vsum2, vsum2); + float32x2_t vrst = vmul_f32(vsum2, vget_low_f32(vcoef)); + data_out_channel[cnt] = vget_lane_f32(vrst, 0); + cnt++; + } +#else + dr_out = data_out_channel + 1; + dr0 = (r0 + 1); + dr1 = (r1 + 1); + cnt_num = w_unroll_size; + cnt_num1 = w_unroll_remian; + if (cnt_num > 0 || cnt_num1 > 0) { + asm volatile( + "cmp %[cnt_num], #0 @cmp cnt_num, " + "0\n" + "ble 3f @ble exit\n" + "1: @main loop\n" + "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5, " + "dr0\n" + "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7, " + "dr1\n" + "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d3, " + "dr0\n" + "vld1.f32 {d10-d11}, [%[dr1]]! @load d4-d7, " + "dr1\n" + "vmax.f32 q6, q0, q3 @max q0,q0,q2 " + "1234\n" + "vmax.f32 q7, q1, q4 @max q1,q1,q3 " + "5678\n" + "vmax.f32 q8, q2, q5 @max q1,q1,q3 " + "9101112\n" + //"vmov.f32 s7,s6 @mov s7, + // s6\n" + "vext.f32 q0, q6, q7, #1 @vext " + "max_2345\n" + "vext.f32 q1, q6, q7, #3 @vext " + "max_4567\n" + "vext.f32 q2, q6, q7, #2 @vext " + "max_3456\n" + "vext.f32 q3, q7, q8, #1 @vext " + "max_6789\n" + "vadd.f32 q4, q6, q0 @add 1234, " + "2345 \n" + "vadd.f32 q5, q7, q1 @add 5678, " + "4567 \n" + "vadd.f32 q4, q4, q2 @add 3456, " + "sum1 \n" + "vadd.f32 q5, q5, q3 @add 6789, " + "sum2 \n" + "vmov.f32 s17, s18 @mov \n" + "vmov.f32 s18, s21 @mov \n" + "vmov.f32 s19, s23 @mov \n" + "vmul.f32 q4, q4, %q[vcoef] @mul \n" + "sub %[dr0], #16 @add w, 8\n" + "sub %[dr1], #16 @add w, 8\n" + "subs %[cnt_num], #1 @subs " + "cnt_num, #1\n" + "vst1.f32 d8, [%[dr_out]]! @vst1 d0, " + "dr_out\n" + "vst1.f32 d9, [%[dr_out]]! @vst1 d0, " + "dr_out\n" + "bne 1b @bne s3_max_loop_bot\n" + "3: @loop \n" + "cmp %[cnt_num1], #0 @cmp " + "cnt_num, 0\n" + "ble 4f @ble exit\n" + "2: @bot loop\n" + "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1, " + "dr0\n" + "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3, " + "dr1\n" + "vext.f32 q0, %q[vzero], q0, #3 @ ext " + "v0_0123\n" + "vext.f32 q1, %q[vzero], q1, #3 @ ext " + "v1_0123\n" + "vadd.f32 q0, q0, q1 @add q0, q0, " + "q1\n" + "vpadd.f32 d0, d0, d1 @padd d0, " + "d0,d1\n" + "vpadd.f32 d0, d0, d0 @padd d0, d0, " + "d0\n" + "vmul.f32 d0, d0, %e[vcoef] @mul \n" + "sub %[dr0], #8 @add w, 6\n" + "sub %[dr1], #8 @add w, 6\n" + "subs %[cnt_num1], #1 @subs " + "cnt_num, #1\n" + "vst1.f32 d0[0], [%[dr_out]]! 
@vst d0[0], " + "dr_out\n" + "bne 2b @bne s3_max_loop_bot_1\n" + "4: @exit\n" + : [dr0] "+r"(dr0), [dr1] "+r"(dr1), [dr_out] "+r"(dr_out), + [cnt_num] "+r"(cnt_num), [cnt_num1] "+r"(cnt_num1), + [vcoef] "+w"(vcoef), [vzero] "+w"(vzero) + : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num), "r"(cnt_num1) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", + "q7", "q8", "q9"); + } +#endif + if (pad_right) { + // deal with right pad + int wstart = (w_even >> 1) * stride_w - pad_w; + int wend = std::min(std::min(wstart + kernel_w, win + pad_w), win); + float tmp = 0.f; + for (int i = wstart; i < wend; i++) { // only run 1 or 2 times + tmp += (r0[i] + r1[i]); + } + data_out_channel[w_even >> 1] = tmp / 9.f; + } + } + } + } + } +} + +void pooling3x3s2p0_max(const void* din, void* dout, int num, int chout, + int hout, int wout, int chin, int hin, int win, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, bool global_pooling, + bool exclusive, bool adaptive, bool ceil_mode, + bool use_quantizer, const std::string& pooling_type) { + int w_in = win; + int h_in = hin; + int ch_in = chin; + + int w_out = wout; + int h_out = hout; + int ch_out = chout; + + int kernel_h = ksize[0]; + int kernel_w = ksize[1]; + int stride_h = strides[0]; + int stride_w = strides[1]; + int pad_h = paddings[0]; + int pad_w = paddings[1]; + + int size_channel_out = w_out * h_out; + int size_channel_in = w_in * h_in; + float* data_out = static_cast(dout); + const float* data_in = static_cast(din); + + int pad_top = pad_h; + int pad_left = pad_w; + int w_needed = w_out * 2 + 1; + int h_needed = h_out * 2 + 1; + int pad_right = w_needed - w_in - pad_left; + int pad_bottom = h_needed - h_in - pad_top; + int w_even = ((w_in - 1) >> 1) << 1; + // int w_remains = w_in - w_even; // should be 0 or 1 + int h_even = ((h_in - 1) >> 1) << 1; + // int h_remains = h_in - h_even; // should be 0 or 1 + int w_unroll_size = w_in >> 3; + int w_unroll_remian = (w_in - w_unroll_size * 8 - 1) / 2; + int w_in_2 = w_in << 1; + float minval = std::numeric_limits::lowest(); + float32x4_t vzero = vdupq_n_f32(minval); // zero pad + // printf("minval: %.2f\n", minval); + + for (int n = 0; n < num; ++n) { + float* data_out_batch = data_out + n * ch_out * size_channel_out; + const float* data_in_batch = data_in + n * ch_in * size_channel_in; +#pragma omp parallel for + for (int c = 0; c < ch_out; c++) { + float* data_out_channel = data_out_batch + c * size_channel_out; + const float* data_in_channel = data_in_batch + c * size_channel_in; + const float* r0 = data_in_channel; + const float* r1 = r0 + w_in; + const float* r2 = r1 + w_in; + int cnt_num = w_unroll_size; + // w = w_in - 8; + int cnt_num1 = w_unroll_remian; + float* dr_out = data_out_channel; + const float* dr0 = r0; + const float* dr1 = r1; + const float* dr2 = r2; + int w = 0; + int cnt = 0; + // data_out_channel[0] = std::max(std::max(r0[0], r0[1]), std::max(r1[0], + // r1[1])); + // first row with zero pad + // r0 = r1; + // r1 = r0 + w_in; + // r2 = r1 + w_in; + // data_out_channel += w_out; + int h = 0; + for (; h < h_even; h += 2) { + // deal with left pad + float maxr0 = std::max(r0[0], r0[1]); + float maxr1 = std::max(r1[0], r1[1]); + float maxr2 = std::max(r2[0], r2[1]); +// data_out_channel[0] = std::max(std::max(maxr0, maxr1), maxr2); +#ifdef __aarch64__ + w = 0; + cnt = 0; + for (; w < w_in - 8; w += 8) { + float32x4_t vr0_1234 = vld1q_f32(&r0[w]); + float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); + float32x4_t vr0_9101112 = vld1q_f32(&r0[w 
+ 8]); + float32x4_t vr1_1234 = vld1q_f32(&r1[w]); + float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); + float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); + float32x4_t vr2_1234 = vld1q_f32(&r2[w]); + float32x4_t vr2_5678 = vld1q_f32(&r2[w + 4]); + float32x4_t vr2_9101112 = vld1q_f32(&r2[w + 8]); + float32x4_t vmax_1234 = vmaxq_f32(vr0_1234, vr1_1234); + vmax_1234 = vmaxq_f32(vmax_1234, vr2_1234); + float32x4_t vmax_5678 = vmaxq_f32(vr0_5678, vr1_5678); + vmax_5678 = vmaxq_f32(vmax_5678, vr2_5678); + float32x4_t vmax_9101112 = vmaxq_f32(vr0_9101112, vr1_9101112); + vmax_9101112 = vmaxq_f32(vmax_9101112, vr2_9101112); + float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678, 1); + float32x4_t vmax_6789 = vextq_f32(vmax_5678, vmax_9101112, 1); + float32x2_t vmax_12_34 = + vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); + float32x2_t vmax_23_45 = + vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); + float32x2_t vmax_56_78 = + vpmax_f32(vget_low_f32(vmax_5678), vget_high_f32(vmax_5678)); + float32x2_t vmax_67_89 = + vpmax_f32(vget_low_f32(vmax_6789), vget_high_f32(vmax_6789)); + float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); + float32x2_t vmax_567_789 = vmax_f32(vmax_56_78, vmax_67_89); + vst1_f32(&data_out_channel[cnt], vmax_123_345); + vst1_f32(&data_out_channel[cnt + 2], vmax_567_789); + cnt += 4; + } + for (; w < w_even - 1; w += 2) { + float32x4_t vr0 = vld1q_f32(&r0[w]); + float32x4_t vr1 = vld1q_f32(&r1[w]); + float32x4_t vr2 = vld1q_f32(&r2[w]); + vr0 = vsetq_lane_f32(minval, vr0, 3); + vr1 = vsetq_lane_f32(minval, vr1, 3); + vr2 = vsetq_lane_f32(minval, vr2, 3); + float32x4_t vmax1 = vmaxq_f32(vr0, vr1); + vmax1 = vmaxq_f32(vmax1, vr2); + float32x2_t vmax2 = + vpmax_f32(vget_low_f32(vmax1), vget_high_f32(vmax1)); + float32x2_t vmax = vpmax_f32(vmax2, vmax2); + data_out_channel[cnt] = vget_lane_f32(vmax, 0); + cnt++; + } +#else + dr_out = data_out_channel; // + 1; + dr0 = r0; // (r0 + 1); + dr1 = r1; // (r1 + 1); + dr2 = r2; // (r2 + 1); + cnt_num = w_unroll_size; + cnt_num1 = w_unroll_remian; + if (cnt_num > 0 || cnt_num1 > 0) { + asm volatile( + "cmp %[cnt_num], #0 @cmp cnt_num, " + "0\n" + "ble 3f @ble exit\n" + "1: @main loop\n" + "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5, " + "dr0\n" + "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7, " + "dr1\n" + "vld1.f32 {d12-d15}, [%[dr2]]! @load d4-d7, " + "dr1\n" + "vld1.f32 {d4}, [%[dr0]]! @load d0-d5, dr0\n" + "vld1.f32 {d10}, [%[dr1]]! @load d4-d7, dr1\n" + "vld1.f32 {d16}, [%[dr2]]! @load d4-d7, dr1\n" + "vmax.f32 q9, q0, q3 @max q0,q0,q2\n" + "vmax.f32 q10, q1, q4 @max q1,q1,q3\n" + "vmax.f32 d22, d4, d10 @max q1,q1,q3\n" + "vmax.f32 q0, q9, q6 @max q0,q0,q2 " + "1234\n" + "vmax.f32 q3, q10, q7 @max q1,q1,q3 " + "5678\n" + "vmax.f32 d2, d22, d16 @max q1,q1,q3 " + "9101112\n" + //"vmov.f32 s7,s6 @mov s7, s6\n" + "vext.f32 q4, q0, q3, #1 @vext 2345\n" + "vext.f32 q2, q3, q1, #1 @vext 6789\n" + "vpmax.f32 d10, d0, d1 @pmax d10, " + "vmax_1234, vmax_1234\n" + "vpmax.f32 d12, d6, d7 @pmax d12, " + "vmax_5678, vmax_5678\n" + "vpmax.f32 d11, d8, d9 @pmax d11, " + "vmax_2345, vmax_2345\n" + "vpmax.f32 d13, d4, d5 @pmax d13, " + "vmax_6789, vmax_6789\n" + "vmax.f32 d0, d10, d11 @pmax d0, " + "vmax_12_34, vmax_23_45\n" + "vmax.f32 d1, d12, d13 @pmax d1, " + "vmax_56_78, vmax_67_89\n" + "sub %[dr0], #8 @add w, 8\n" + "sub %[dr1], #8 @add w, 8\n" + "sub %[dr2], #8 @add w, 8\n" + "vst1.f32 d0, [%[dr_out]]! @vst1 d0, " + "dr_out\n" + "vst1.f32 d1, [%[dr_out]]! 
@vst1 d0, " + "dr_out\n" + "subs %[cnt_num], #1 @subs " + "cnt_num, #1\n" + "bne 1b @bne s3_max_loop_mid\n" + "3: @loop \n" + "cmp %[cnt_num1], #0 @cmp " + "cnt_num, 0\n" + "ble 4f @ble exit1\n" + "2: @mid loop\n" + "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1, " + "dr0\n" + "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3, " + "dr1\n" + "vld1.f32 {d4-d5}, [%[dr2]]! @load d2-d3, " + "dr1\n" + "vmov.f32 s3,s2 @movs3, s2\n" + "vmov.f32 s7,s6 @movs7, s6\n" + "vmov.f32 s11,s10 @movs11, s10\n" + "vmax.f32 q0, q0, q1 @max q0, q0, " + "q1\n" + "vmax.f32 q0, q0, q2 @max q0, q0, " + "q2\n" + "vpmax.f32 d0, d0, d1 @pmax d0, " + "d0,d1\n" + "vpmax.f32 d0, d0, d0 @pmax d0, d0, " + "d0\n" + "vst1.f32 d0[0], [%[dr_out]]! @vst d0[0], " + "dr_out\n" + "sub %[dr0], #8 @add w, 6\n" + "sub %[dr1], #8 @add w, 6\n" + "sub %[dr2], #8 @add w, 6\n" + "subs %[cnt_num1], #1 @subs cnt_num, " + "#1\n" + "bne 2b @bne s3_max_loop_mid_1\n" + "4: @exit\n" + : [dr0] "+r"(dr0), [dr1] "+r"(dr1), [dr2] "+r"(dr2), + [dr_out] "+r"(dr_out), [cnt_num] "+r"(cnt_num), + [cnt_num1] "+r"(cnt_num1) + : "r"(dr0), "r"(dr1), "r"(dr2), "r"(dr_out), "r"(cnt_num), + "r"(cnt_num1) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12"); + } +#endif + if (pad_right) { + // deal with right pad + int wstart = (w_even >> 1) * stride_w - pad_w; + int wend = std::min(std::min(wstart + kernel_w, w_in + pad_w), w_in); + float tmp = r0[wstart]; // std::numeric_limits::min(); + for (int i = wstart; i < wend; i++) { + tmp = std::max(tmp, std::max(r0[i], r1[i])); + tmp = std::max(tmp, r2[i]); + } + data_out_channel[w_even >> 1] = tmp; + // cnt ++; + } + r0 = r2; + r1 = r0 + w_in; + r2 = r1 + w_in; + data_out_channel += w_out; + } + + if (pad_bottom) { +// deal with bottom pad +// first row with zero pad +// int hstart = (h >> 1) * stride_h - pad_h; +// int hend = std::min(std::min(hstart + kernel_h, h_in + pad_h),h_in); +// data_out_channel[0] = std::max(std::max(r0[0], r0[1]), std::max(r1[0], +// r1[1])); +#ifdef __aarch64__ + w = 0; + cnt = 0; + for (; w < w_in - 8; w += 8) { + float32x4_t vr0_1234 = vld1q_f32(&r0[w]); + float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); + float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); + float32x4_t vr1_1234 = vld1q_f32(&r1[w]); + float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); + float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); + float32x4_t vmax_1234 = vmaxq_f32(vr0_1234, vr1_1234); + float32x4_t vmax_5678 = vmaxq_f32(vr0_5678, vr1_5678); + float32x4_t vmax_9101112 = vmaxq_f32(vr0_9101112, vr1_9101112); + float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678, 1); + float32x4_t vmax_6789 = vextq_f32(vmax_5678, vmax_9101112, 1); + float32x2_t vmax_12_34 = + vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); + float32x2_t vmax_23_45 = + vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); + float32x2_t vmax_56_78 = + vpmax_f32(vget_low_f32(vmax_5678), vget_high_f32(vmax_5678)); + float32x2_t vmax_67_89 = + vpmax_f32(vget_low_f32(vmax_6789), vget_high_f32(vmax_6789)); + float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); + float32x2_t vmax_567_789 = vmax_f32(vmax_56_78, vmax_67_89); + vst1_f32(&data_out_channel[cnt], vmax_123_345); + vst1_f32(&data_out_channel[cnt + 2], vmax_567_789); + cnt += 4; + } + for (; w < w_even - 1; w += 2) { + float32x4_t vr0 = vld1q_f32(&r0[w]); + float32x4_t vr1 = vld1q_f32(&r1[w]); + vr0 = vsetq_lane_f32(minval, vr0, 3); + vr1 = vsetq_lane_f32(minval, vr1, 3); + float32x4_t vmax1 = vmaxq_f32(vr0, vr1); + float32x2_t vmax2 = + 
vpmax_f32(vget_low_f32(vmax1), vget_high_f32(vmax1)); + vmax2 = vpmax_f32(vmax2, vmax2); + data_out_channel[cnt] = vget_lane_f32(vmax2, 0); + cnt++; + } +#else + dr_out = data_out_channel; // + 1; + dr0 = r0; // (r0 + 1); + dr1 = r1; // (r1 + 1); + cnt_num = w_unroll_size; + cnt_num1 = w_unroll_remian; + if (cnt_num > 0 || cnt_num1 > 0) { + asm volatile( + "cmp %[cnt_num], #0 @cmp cnt_num, " + "0\n" + "ble 3f @ble exit\n" + "1: @main loop\n" + "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5, " + "dr0\n" + "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7, " + "dr1\n" + "vld1.f32 {d4}, [%[dr0]]! @load d0-d3, dr0\n" + "vld1.f32 {d10}, [%[dr1]]! @load d4-d7, dr1\n" + "vmax.f32 q6, q0, q3 @max q0,q0,q2 " + "1234\n" + "vmax.f32 q7, q1, q4 @max q1,q1,q3 " + "5678\n" + "vmax.f32 d16, d4, d10 @max q1,q1,q3 " + "9101112\n" + //"vmov.f32 s7,s6 @mov s7, s6\n" + "vext.f32 q0, q6, q7, #1 @vext q0, 2345\n" + "vext.f32 q1, q7, q8, #1 @vext q1, 6789\n" + "vpmax.f32 d4, d12, d13 @pmax d4, " + "vmax_1234, vmax_1234\n" + "vpmax.f32 d6, d14, d15 @pmax d6, " + "vmax_5678, vmax_5678\n" + "vpmax.f32 d5, d0, d1 @pmax d5, " + "vmax_2345, vmax_2345\n" + "vpmax.f32 d7, d2, d3 @pmax d7, " + "vmax_6789, vmax_6789\n" + "vmax.f32 d8, d4, d5 @max d2, " + "vmax_12_34, vmax_23_45\n" + "vmax.f32 d9, d6, d7 @max d2, " + "vmax_56_78, vmax_67_89\n" + "sub %[dr0], #8 @add w, 8\n" + "sub %[dr1], #8 @add w, 8\n" + "vst1.f32 d8, [%[dr_out]]! @vst1 d0, " + "dr_out\n" + "vst1.f32 d9, [%[dr_out]]! @vst1 d0, " + "dr_out\n" + "subs %[cnt_num], #1 @subs " + "cnt_num, #1\n" + "bne 1b @bne s3_max_loop_bot\n" + "3: @loop \n" + "cmp %[cnt_num1], #0 @cmp " + "cnt_num, 0\n" + "ble 4f @ble exit\n" + "2: @bot loop\n" + "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1, " + "dr0\n" + "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3, " + "dr1\n" + "vmov.f32 s3,s2 @movs3, s2\n" + "vmov.f32 s7,s6 @movs7, s6\n" + "vmax.f32 q0, q0, q1 @max q0, q0, " + "q1\n" + "vpmax.f32 d0, d0, d1 @pmax d0, " + "d0,d1\n" + "vpmax.f32 d0, d0, d0 @pmax d0, d0, " + "d0\n" + "vst1.f32 d0[0], [%[dr_out]]! 
@vst d0[0], " + "dr_out\n" + "sub %[dr0], #8 @add w, 6\n" + "sub %[dr1], #8 @add w, 6\n" + "subs %[cnt_num1], #1 @subs " + "cnt_num, #1\n" + "bne 2b @bne s3_max_loop_bot_1\n" + "4: @exit\n" + : [dr0] "+r"(dr0), [dr1] "+r"(dr1), [dr_out] "+r"(dr_out), + [cnt_num] "+r"(cnt_num), [cnt_num1] "+r"(cnt_num1) + : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num), "r"(cnt_num1) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9"); + } +#endif + if (pad_right) { + // deal with right pad + int wstart = (w_even >> 1) * stride_w - pad_w; + int wend = std::min(std::min(wstart + kernel_w, w_in + pad_w), w_in); + float tmp = r0[wstart]; // std::numeric_limits::min(); + for (int i = wstart; i < wend; i++) { // only run 1 or 2 times + tmp = std::max(tmp, std::max(r0[i], r1[i])); + } + data_out_channel[w_even >> 1] = tmp; + } + } + } + } +} + +void pooling3x3s2p0_ave(const void* din, void* dout, int num, int chout, + int hout, int wout, int chin, int hin, int win, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, bool global_pooling, + bool exclusive, bool adaptive, bool ceil_mode, + bool use_quantizer, const std::string& pooling_type) { + int w_in = win; + int h_in = hin; + int ch_in = chin; + + int w_out = wout; + int h_out = hout; + int ch_out = chout; + + int kernel_h = ksize[0]; + int kernel_w = ksize[1]; + int stride_h = strides[0]; + int stride_w = strides[1]; + int pad_h = paddings[0]; + int pad_w = paddings[1]; + + int size_channel_out = w_out * h_out; + int size_channel_in = w_in * h_in; + float* data_out = static_cast(dout); + const float* data_in = static_cast(din); + + int pad_top = pad_h; + int pad_left = pad_w; + int w_needed = w_out * 2 + 1; + int h_needed = h_out * 2 + 1; + int pad_right = w_needed - w_in - pad_left; + int pad_bottom = h_needed - h_in - pad_top; + int w_even = ((w_in - 1) >> 1) << 1; + int h_even = ((h_in - 1) >> 1) << 1; + int w_in_2 = w_in << 1; + int w_unroll_size = w_in >> 3; + int w_unroll_remian = (w_even - w_unroll_size * 8 - 1) / 2; + for (int n = 0; n < num; ++n) { + float* data_out_batch = data_out + n * ch_out * size_channel_out; + const float* data_in_batch = data_in + n * ch_in * size_channel_in; +#pragma omp parallel for + for (int c = 0; c < ch_out; c++) { + float* data_out_channel = data_out_batch + c * size_channel_out; + const float* data_in_channel = data_in_batch + c * size_channel_in; + const float* r0 = data_in_channel; + const float* r1 = r0 + w_in; + const float* r2 = r1 + w_in; + int cnt_num = w_unroll_size; + // w = w_in - 8; + int cnt_num1 = w_unroll_remian; + float* dr_out = data_out_channel; + const float* dr0 = r0; + const float* dr1 = r1; + const float* dr2 = r2; + + float32x4_t vcoef = vdupq_n_f32(1.f / 9.f); + float32x4_t vzero = vdupq_n_f32(0.f); + + int h = 0; + for (; h < h_even; h += 2) { +// LOG(INFO) << "h: " << h<<", dr0:" << r0 <<", dr1: "< 0 || cnt_num1 > 0) { + asm volatile( + "cmp %[cnt_num], #0 @cmp cnt_num, " + "0\n" + "ble loop3_ave_p0 @ble " + "exit\n" + "s3_ave_loop_mid_p0: @main loop\n" + "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5, " + "dr0\n" + "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7, " + "dr1\n" + "vld1.f32 {d12-d15}, [%[dr2]]! @load d4-d7, " + "dr1\n" + "vld1.f32 {d4}, [%[dr0]]! @load d0-d5, dr0\n" + "vld1.f32 {d10}, [%[dr1]]! @load d4-d7, dr1\n" + "vld1.f32 {d16}, [%[dr2]]! 
@load d4-d7, dr1\n" + "vadd.f32 q9, q0, q3 @max q0,q0,q2\n" + "vadd.f32 q10, q1, q4 @max q1,q1,q3\n" + "vadd.f32 d22, d4, d10 @max q1,q1,q3\n" + "vadd.f32 q6, q9, q6 @max q0,q0,q2 " + "1234\n" + "vadd.f32 q7, q10, q7 @max q1,q1,q3 " + "5678\n" + "vadd.f32 d16, d22, d16 @max q1,q1,q3 " + "9101112\n" + //"vmov.f32 s7,s6 @mov s7, s6\n" + "vext.f32 q0, q6, q7, #1 @vext max_2345\n" + "vext.f32 q1, q6, q7, #3 @vext max_4567\n" + "vext.f32 q2, q6, q7, #2 @vext max_3456\n" + "vext.f32 q3, q7, q8, #1 @vext max_6789\n" + "vadd.f32 q4, q6, q0 @add 1234, 2345 " + "\n" + "vadd.f32 q5, q7, q1 @add 5678, 4567 " + "\n" + "vadd.f32 q4, q4, q2 @add 3456, sum1 " + "\n" + "vadd.f32 q5, q5, q3 @add 6789, sum2 " + "\n" + "vmov.f32 s17, s18 @mov \n" + "vmov.f32 s18, s21 @mov \n" + "vmov.f32 s19, s23 @mov \n" + "vmul.f32 q4, q4, %q[vcoef] @mul \n" + "sub %[dr0], #8 @add w, 8\n" + "sub %[dr1], #8 @add w, 8\n" + "sub %[dr2], #8 @add w, 8\n" + "subs %[cnt_num], #1 @subs " + "cnt_num, #1\n" + "vst1.f32 d8, [%[dr_out]]! @vst1 d0, " + "dr_out\n" + "vst1.f32 d9, [%[dr_out]]! @vst1 d0, " + "dr_out\n" + "bne s3_ave_loop_mid_p0 @bne " + "s3_max_loop_mid\n" + "loop3_ave_p0: @loop \n" + "cmp %[cnt_num1], #0 @cmp " + "cnt_num, 0\n" + "ble exit1_ave_p0 @ble " + "exit1\n" + "s3_ave_loop_mid_1_p0: @mid loop\n" + "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1, " + "dr0\n" + "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3, " + "dr1\n" + "vld1.f32 {d4-d5}, [%[dr2]]! @load d2-d3, " + "dr1\n" + "vext.f32 q0, %q[vzero], q0, #3 @ ext v0_0123\n" + "vext.f32 q1, %q[vzero], q1, #3 @ ext v1_0123\n" + "vext.f32 q2, %q[vzero], q2, #3 @ ext v1_0123\n" + "vadd.f32 q0, q0, q1 @add q0, q0, " + "q1\n" + "vadd.f32 q0, q0, q2 @add q0, q0, " + "q1\n" + "vpadd.f32 d0, d0, d1 @padd d0, " + "d0,d1\n" + "vpadd.f32 d0, d0, d0 @padd d0, d0, " + "d0\n" + "vmul.f32 d0, d0, %e[vcoef] @mul \n" + "sub %[dr0], #8 @add w, 6\n" + "sub %[dr1], #8 @add w, 6\n" + "sub %[dr2], #8 @add w, 6\n" + "subs %[cnt_num1], #1 @subs cnt_num, " + "#1\n" + "vst1.f32 d0[0], [%[dr_out]]! 
@vst d0[0], " + "dr_out\n" + "bne s3_ave_loop_mid_1_p0 @bne " + "s3_max_loop_mid_1\n" + "exit1_ave_p0: @exit\n" + : [dr0] "+r"(dr0), [dr1] "+r"(dr1), [dr2] "+r"(dr2), + [dr_out] "+r"(dr_out), [cnt_num] "+r"(cnt_num), + [cnt_num1] "+r"(cnt_num1), [vcoef] "+w"(vcoef), + [vzero] "+w"(vzero) + : "r"(dr0), "r"(dr1), "r"(dr2), "r"(dr_out), "r"(cnt_num), + "r"(cnt_num1) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12"); + } +#endif + if (pad_right) { + // deal with right pad + int wstart = (w_even >> 1) * stride_w - pad_w; + int wend = std::min(std::min(wstart + kernel_w, w_in + pad_w), w_in); + float tmp = 0.f; + int pool_size = 3 * (wend - wstart); + for (int i = wstart; i < wend; i++) { + tmp += (r0[i] + r1[i] + r2[i]); + } + data_out_channel[w_even >> 1] = tmp / pool_size; + // cnt ++; + } + r0 = r2; + r1 = r0 + w_in; + r2 = r1 + w_in; + data_out_channel += w_out; + } + + if (pad_bottom) { +// deal with bottom pad +// first row with zero pad +// int hstart = (h >> 1) * stride_h - pad_h; +// int hend = std::min(std::min(hstart + kernel_h, h_in + pad_h),h_in); +// data_out_channel[0] =(r0[0] + r0[1] + r1[0] + r1[1]) / 9.f; +#if 1 // def __aarch64__ + int w = 0; + int cnt = 0; + vcoef = vdupq_n_f32(1.f / 6.f); + for (; w < w_in - 8; w += 8) { + float32x4_t vr0_1234 = vld1q_f32(&r0[w]); + float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); + float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); + float32x4_t vr1_1234 = vld1q_f32(&r1[w]); + float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); + float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); + + float32x4_t vsum_1234 = vaddq_f32(vr0_1234, vr1_1234); + float32x4_t vsum_5678 = vaddq_f32(vr0_5678, vr1_5678); + float32x4_t vsum_9101112 = vaddq_f32(vr0_9101112, vr1_9101112); + float32x4_t vsum_2345 = vextq_f32(vsum_1234, vsum_5678, 1); + float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678, 2); + float32x4_t vsum_4567 = vextq_f32(vsum_1234, vsum_5678, 3); + float32x4_t vsum_6789 = vextq_f32(vsum_5678, vsum_9101112, 1); + float32x4_t vsum_123_345 = vaddq_f32(vsum_1234, vsum_2345); + vsum_123_345 = vaddq_f32(vsum_123_345, vsum_3456); + float32x4_t vsum_567_789 = vaddq_f32(vsum_4567, vsum_5678); + vsum_567_789 = vaddq_f32(vsum_567_789, vsum_6789); + vsum_123_345 = + vsetq_lane_f32(vgetq_lane_f32(vsum_123_345, 2), vsum_123_345, 1); + vsum_123_345 = + vsetq_lane_f32(vgetq_lane_f32(vsum_567_789, 1), vsum_123_345, 2); + vsum_123_345 = + vsetq_lane_f32(vgetq_lane_f32(vsum_567_789, 3), vsum_123_345, 3); + float32x4_t vrst = vmulq_f32(vsum_123_345, vcoef); + vst1q_f32(&data_out_channel[cnt], vrst); + cnt += 4; + } + for (; w < w_even - 1; w += 2) { + float32x4_t vr0 = vld1q_f32(&r0[w]); + float32x4_t vr1 = vld1q_f32(&r1[w]); + vr0 = vsetq_lane_f32(0.f, vr0, 3); + vr1 = vsetq_lane_f32(0.f, vr1, 3); + float32x4_t vsum1 = vaddq_f32(vr0, vr1); + float32x2_t vsum2 = + vpadd_f32(vget_low_f32(vsum1), vget_high_f32(vsum1)); + vsum2 = vpadd_f32(vsum2, vsum2); + float32x2_t vrst = vmul_f32(vsum2, vget_low_f32(vcoef)); + data_out_channel[cnt] = vget_lane_f32(vrst, 0); + cnt++; + } +#else + dr_out = data_out_channel; // + 1; + dr0 = r0; // (r0 + 1); + dr1 = r1; // (r1 + 1); + cnt_num = w_unroll_size; + cnt_num1 = w_unroll_remian; + // LOG(INFO) << "dr0:" << dr0 <<", dr1: "< 0 || cnt_num1 > 0) { + asm volatile( + "cmp %[cnt_num], #0 @cmp cnt_num, " + "0\n" + "ble 2f @ble exit\n" + "1: @main loop\n" + "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5, " + "dr0\n" + "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7, " + "dr1\n" + "vld1.f32 {d4}, [%[dr0]]! 
@load d0-d3, dr0\n" + "vld1.f32 {d10}, [%[dr1]]! @load d4-d7, dr1\n" + "vadd.f32 q6, q0, q3 @max q0,q0,q2 " + "1234\n" + "vadd.f32 q7, q1, q4 @max q1,q1,q3 " + "5678\n" + "vadd.f32 d16, d4, d10 @max q1,q1,q3 " + "9101112\n" + //"vmov.f32 s7,s6 @mov s7, s6\n" + "vext.f32 q0, q6, q7, #1 @vext max_2345\n" + "vext.f32 q1, q6, q7, #3 @vext max_4567\n" + "vext.f32 q2, q6, q7, #2 @vext max_3456\n" + "vext.f32 q3, q7, q8, #1 @vext max_6789\n" + "vadd.f32 q4, q6, q0 @add 1234, 2345 " + "\n" + "vadd.f32 q5, q7, q1 @add 5678, 4567 " + "\n" + "vadd.f32 q4, q4, q2 @add 3456, sum1 " + "\n" + "vadd.f32 q5, q5, q3 @add 6789, sum2 " + "\n" + "vmov.f32 s17, s18 @mov \n" + "vmov.f32 s18, s21 @mov \n" + "vmov.f32 s19, s23 @mov \n" + "vmul.f32 q4, q4, %q[vcoef] @mul \n" + "sub %[dr0], #8 @add w, 8\n" + "sub %[dr1], #8 @add w, 8\n" + "subs %[cnt_num], #1 @subs " + "cnt_num, #1\n" + "vst1.f32 d8, [%[dr_out]]! @vst1 d0, " + "dr_out\n" + "vst1.f32 d9, [%[dr_out]]! @vst1 d0, " + "dr_out\n" + "bne 1b @bne s3_max_loop_bot\n" + "2: @loop \n" + "cmp %[cnt_num1], #0 @cmp " + "cnt_num, 0\n" + "ble 3f @ble exit\n" + "4: @bot loop\n" + "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1, " + "dr0\n" + "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3, " + "dr1\n" + "vext.f32 q0, %q[vzero], q0, #3 @ ext v0_0123\n" + "vext.f32 q1, %q[vzero], q1, #3 @ ext v1_0123\n" + "vadd.f32 q0, q0, q1 @add q0, q0, " + "q1\n" + "vpadd.f32 d0, d0, d1 @padd d0, " + "d0,d1\n" + "vpadd.f32 d0, d0, d0 @padd d0, d0, " + "d0\n" + "vmul.f32 d0, d0, %e[vcoef] @mul \n" + "sub %[dr0], #8 @add w, 6\n" + "sub %[dr1], #8 @add w, 6\n" + "subs %[cnt_num1], #1 @subs " + "cnt_num, #1\n" + "vst1.f32 d0[0], [%[dr_out]]! @vst d0[0], " + "dr_out\n" + "bne 4b @bne s3_max_loop_bot_1\n" + "3: @exit\n" + : [dr0] "+r"(dr0), [dr1] "+r"(dr1), [dr_out] "+r"(dr_out), + [cnt_num] "+r"(cnt_num), [cnt_num1] "+r"(cnt_num1), + [vcoef] "+w"(vcoef), [vzero] "+w"(vzero) + : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num), "r"(cnt_num1) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9"); + } + +#endif + if (pad_right) { + // deal with right pad + int wstart = (w_even >> 1) * stride_w - pad_w; + int wend = std::min(std::min(wstart + kernel_w, w_in + pad_w), w_in); + float tmp = 0.f; + int pool_size = 2 * (wend - wstart); + for (int i = wstart; i < wend; i++) { // only run 1 or 2 times + tmp += (r0[i] + r1[i]); + } + data_out_channel[w_even >> 1] = tmp / pool_size; + } + } + } + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/arm/math/pooling.h b/paddle/fluid/lite/arm/math/pooling.h new file mode 100644 index 0000000000000000000000000000000000000000..36832187073c2d29a129a10fdd7984ba8d15db3d --- /dev/null +++ b/paddle/fluid/lite/arm/math/pooling.h @@ -0,0 +1,111 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
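The pooling.h header introduced below collects the declarations for these routines: the generic pooling_basic and pooling_global fallbacks plus one hand-tuned entry point per specialization (2x2 stride 2, 3x3 stride 1 pad 1, and 3x3 stride 2 with pad 0 or pad 1, each in max and average variants). All of them share the same flat argument list, and the caller is expected to pick the variant matching its kernel/stride/padding and to pass output sizes consistent with them, since the kernels derive the right/bottom padding from hout/wout (the w_needed = wout * 2 + 1 bookkeeping above). A minimal calling sketch follows; the NCHW shape, the std::vector<int> element type, and the wrapper function are illustrative assumptions, not part of this patch:

#include <string>
#include <vector>
#include "paddle/fluid/lite/arm/math/pooling.h"

// Hypothetical example: 3x3 max pooling, stride 2, pad 1, on a 1x8x32x32
// NCHW float tensor, producing 1x8x16x16 ((32 + 2*1 - 3) / 2 + 1 = 16).
void run_pool3x3s2p1_max(const float* din, float* dout) {
  std::vector<int> ksize = {3, 3};
  std::vector<int> strides = {2, 2};
  std::vector<int> paddings = {1, 1};
  paddle::lite::arm::math::pooling3x3s2p1_max(
      din, dout, /*num=*/1, /*chout=*/8, /*hout=*/16, /*wout=*/16,
      /*chin=*/8, /*hin=*/32, /*win=*/32, ksize, strides, paddings,
      /*global_pooling=*/false, /*exclusive=*/true, /*adaptive=*/false,
      /*ceil_mode=*/false, /*use_quantizer=*/false, /*pooling_type=*/"max");
}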
+ +#pragma once + +#include +#include +#include +#include "paddle/fluid/lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +// !pooling fp32 Op +void pooling_basic(const void* din, void* dout, int num, int chout, int hout, + int wout, int chin, int hin, int win, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, bool global_pooling, + bool exclusive, bool adaptive, bool ceil_mode, + bool use_quantizer, const std::string& pooling_type); + +void pooling_global(const void* din, void* dout, int num, int chout, int hout, + int wout, int chin, int hin, int win, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, bool global_pooling, + bool exclusive, bool adaptive, bool ceil_mode, + bool use_quantizer, const std::string& pooling_type); + +void pooling2x2s2_max(const void* din, void* dout, int num, int chout, int hout, + int wout, int chin, int hin, int win, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, bool global_pooling, + bool exclusive, bool adaptive, bool ceil_mode, + bool use_quantizer, const std::string& pooling_type); + +void pooling2x2s2_ave(const void* din, void* dout, int num, int chout, int hout, + int wout, int chin, int hin, int win, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, bool global_pooling, + bool exclusive, bool adaptive, bool ceil_mode, + bool use_quantizer, const std::string& pooling_type); + +void pooling3x3s1p1_max(const void* din, void* dout, int num, int chout, + int hout, int wout, int chin, int hin, int win, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, bool global_pooling, + bool exclusive, bool adaptive, bool ceil_mode, + bool use_quantizer, const std::string& pooling_type); + +void pooling3x3s1p1_ave(const void* din, void* dout, int num, int chout, + int hout, int wout, int chin, int hin, int win, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, bool global_pooling, + bool exclusive, bool adaptive, bool ceil_mode, + bool use_quantizer, const std::string& pooling_type); + +void pooling3x3s2p1_max(const void* din, void* dout, int num, int chout, + int hout, int wout, int chin, int hin, int win, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, bool global_pooling, + bool exclusive, bool adaptive, bool ceil_mode, + bool use_quantizer, const std::string& pooling_type); + +void pooling3x3s2p0_max(const void* din, void* dout, int num, int chout, + int hout, int wout, int chin, int hin, int win, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, bool global_pooling, + bool exclusive, bool adaptive, bool ceil_mode, + bool use_quantizer, const std::string& pooling_type); + +void pooling3x3s2p1_ave(const void* din, void* dout, int num, int chout, + int hout, int wout, int chin, int hin, int win, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, bool global_pooling, + bool exclusive, bool adaptive, bool ceil_mode, + bool use_quantizer, const std::string& pooling_type); + +void pooling3x3s2p0_ave(const void* din, void* dout, int num, int chout, + int hout, int wout, int chin, int hin, int win, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, bool global_pooling, + bool exclusive, bool adaptive, bool ceil_mode, + bool 
use_quantizer, const std::string& pooling_type); + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/arm/math/scale.cc b/paddle/fluid/lite/arm/math/scale.cc index 40b91e6979f6f330f96f4c086fe1856707d9b189..ce969358f689ef7713efb435ce58ba72471d282b 100644 --- a/paddle/fluid/lite/arm/math/scale.cc +++ b/paddle/fluid/lite/arm/math/scale.cc @@ -58,6 +58,111 @@ void scale(const float* din, float* dout, int num, float scale, } } +template <> +void scale(const float* din, float* dout, int outer_dim, int scale_dim, + int inner_dim, const float* scale_data, + const float* bias_data) { + int cnt = inner_dim >> 4; + int remain = inner_dim % 16; + int size = inner_dim * scale_dim; + for (int n = 0; n < outer_dim; n++) { + const float* din_ptr_n = din + n * size; + float* dout_ptr_n = dout + n * size; +#pragma omp parallel for + for (int i = 0; i < scale_dim; i++) { + const float* din_ptr = din_ptr_n + i * inner_dim; + float* dout_ptr = dout_ptr_n + i * inner_dim; + float scale = scale_data[i]; + float32x4_t vscale = vdupq_n_f32(scale); + float bias = bias_data[i]; + float32x4_t vbias = vdupq_n_f32(bias); + for (int j = 0; j < cnt; j++) { + float32x4_t din0 = vld1q_f32(din_ptr); + float32x4_t din1 = vld1q_f32(din_ptr + 4); + float32x4_t din2 = vld1q_f32(din_ptr + 8); + float32x4_t din3 = vld1q_f32(din_ptr + 12); + + float32x4_t vsum1 = vmlaq_f32(vbias, din0, vscale); + float32x4_t vsum2 = vmlaq_f32(vbias, din1, vscale); + float32x4_t vsum3 = vmlaq_f32(vbias, din2, vscale); + float32x4_t vsum4 = vmlaq_f32(vbias, din3, vscale); + + din_ptr += 16; + vst1q_f32(dout_ptr, vsum1); + vst1q_f32(dout_ptr + 4, vsum2); + vst1q_f32(dout_ptr + 8, vsum3); + vst1q_f32(dout_ptr + 12, vsum4); + + dout_ptr += 16; + } + for (int j = 0; j < remain; j++) { + *dout_ptr = *din_ptr * scale + bias; + dout_ptr++; + din_ptr++; + } + } + } +} + +template <> +void scale(const float* din, float* dout, int outer_dim, int scale_dim, + const float* scale_data, const float* bias_data) { + int cnt = scale_dim >> 4; + int remain = scale_dim % 16; + for (int n = 0; n < outer_dim; n++) { + const float* din_ptr_n = din + n * scale_dim; + float* dout_ptr_n = dout + n * scale_dim; +#pragma omp parallel for + for (int i = 0; i < cnt; i++) { + int idx = i << 4; + const float* din_ptr = din_ptr_n + idx; + const float* scale_ptr = scale_data + idx; + const float* bias_ptr = bias_data + idx; + float* dout_ptr = dout_ptr_n + idx; + + float32x4_t din0 = vld1q_f32(din_ptr); + float32x4_t vscale0 = vld1q_f32(scale_ptr); + float32x4_t vbias0 = vld1q_f32(bias_ptr); + + float32x4_t din1 = vld1q_f32(din_ptr + 4); + float32x4_t vscale1 = vld1q_f32(scale_ptr + 4); + float32x4_t vbias1 = vld1q_f32(bias_ptr + 4); + + float32x4_t din2 = vld1q_f32(din_ptr + 8); + float32x4_t vscale2 = vld1q_f32(scale_ptr + 8); + float32x4_t vbias2 = vld1q_f32(bias_ptr + 8); + + float32x4_t vsum1 = vmlaq_f32(vbias0, din0, vscale0); + float32x4_t vsum2 = vmlaq_f32(vbias1, din1, vscale1); + + float32x4_t din3 = vld1q_f32(din_ptr + 12); + float32x4_t vscale3 = vld1q_f32(scale_ptr + 12); + float32x4_t vbias3 = vld1q_f32(bias_ptr + 12); + + vst1q_f32(dout_ptr, vsum1); + vst1q_f32(dout_ptr + 4, vsum2); + + float32x4_t vsum3 = vmlaq_f32(vbias2, din2, vscale2); + float32x4_t vsum4 = vmlaq_f32(vbias3, din3, vscale3); + + vst1q_f32(dout_ptr + 8, vsum3); + vst1q_f32(dout_ptr + 12, vsum4); + } + int idx = cnt << 4; + const float* din_ptr = din_ptr_n + idx; + float* dout_ptr = dout_ptr_n + idx; + const float* scale_ptr = 
scale_data + idx; + const float* bias_ptr = bias_data + idx; + for (int j = 0; j < remain; j++) { + *dout_ptr = *din_ptr * (*scale_ptr) + (*bias_ptr); + dout_ptr++; + din_ptr++; + scale_ptr++; + bias_ptr++; + } + } +} + } // namespace math } // namespace arm } // namespace lite diff --git a/paddle/fluid/lite/arm/math/scale.h b/paddle/fluid/lite/arm/math/scale.h index 97a5f79fc6bfabee5e38854e2ba89ce388648aac..2274dd23d2f4f486e39b97ad5040bde47af8a042 100644 --- a/paddle/fluid/lite/arm/math/scale.h +++ b/paddle/fluid/lite/arm/math/scale.h @@ -22,6 +22,14 @@ namespace math { template void scale(const T* din, T* dout, int num, float scale, float bias); +template +void scale(const T* din, T* dout, int outer_dim, int scale_dim, int inner_dim, + const float* scale_data, const float* bias_data); + +template +void scale(const T* din, T* dout, int outer_dim, int scale_dim, + const float* scale_data, const float* bias_data); + } // namespace math } // namespace arm } // namespace lite diff --git a/paddle/fluid/lite/arm/math/split.cc b/paddle/fluid/lite/arm/math/split.cc new file mode 100644 index 0000000000000000000000000000000000000000..bf8d50590ff89c451347e33a289391b8d929e5b6 --- /dev/null +++ b/paddle/fluid/lite/arm/math/split.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
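+//
+// Split along `axis`: each output tensor receives one contiguous slice of the
+// input. out_strides[i] is the number of elements covered by dimension i and
+// everything after it, so out_strides[axis] is the size of one slice and
+// `before` counts how many such slices must be copied per output.
+// split_cpy<float> is the NEON-accelerated copy used for each slice
+// (16 floats per iteration plus a scalar tail). Rough shape of the loop,
+// for orientation only:
+//
+//   for (auto* out : outputs)            // one output tensor at a time
+//     for (int i = 0; i < before; ++i)   // each outer block
+//       copy out_after floats from din + offset + i * in_after
+//   offset += out_after;                 // advance past this output
+//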
+ +#include "paddle/fluid/lite/arm/math/split.h" +#include +#include "paddle/fluid/lite/arm/math/funcs.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +template <> +void split_cpy(const float* din, float* dout, int num) { + int cnt = num >> 4; + int remain = num % 16; +#pragma omp parallel for + for (int i = 0; i < cnt; i++) { + const float* din_ptr = din + (i << 4); + float* dout_ptr = dout + (i << 4); + + float32x4_t din0 = vld1q_f32(din_ptr); + float32x4_t din1 = vld1q_f32(din_ptr + 4); + float32x4_t din2 = vld1q_f32(din_ptr + 8); + float32x4_t din3 = vld1q_f32(din_ptr + 12); + + vst1q_f32(dout_ptr, din0); + vst1q_f32(dout_ptr + 4, din1); + vst1q_f32(dout_ptr + 8, din2); + vst1q_f32(dout_ptr + 12, din3); + } + if (remain > 0) { + const float* din_ptr = din + (cnt << 4); + float* dout_ptr = dout + (cnt << 4); + for (int i = 0; i < remain; i++) { + *dout_ptr = *din_ptr; + dout_ptr++; + din_ptr++; + } + } +} + +template <> +void split(const float* din, const std::vector& dout, + const int axis, const std::vector& in_strides) { + int input_offset = 0; + for (auto out : dout) { + auto out_dim = out->dims(); + std::vector out_strides(out_dim.size()); + out_strides[out_dim.size() - 1] = out_dim[out_dim.size() - 1]; + for (int i = out_dim.size() - 2; i >= 0; --i) { + out_strides[i] = out_strides[i + 1] * out_dim[i]; + } + + float* out_data = out->mutable_data(); + int before = out_strides[0] / out_strides[axis]; + int in_after = in_strides[axis]; + int out_after = out_strides[axis]; + + for (int i = 0; i < before; ++i) { + split_cpy(din + input_offset + i * in_after, out_data + i * out_after, + out_after); + } + input_offset += out_strides[axis]; + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/arm/math/split.h b/paddle/fluid/lite/arm/math/split.h new file mode 100644 index 0000000000000000000000000000000000000000..643214e174c3ede02f430ee4ded7cee097ba0afc --- /dev/null +++ b/paddle/fluid/lite/arm/math/split.h @@ -0,0 +1,35 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +template +void split_cpy(const T* din, T* dout, int num); + +template +void split(const T* din, const std::vector& dout, const int axis, + const std::vector& in_strides); + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/arm/math/type_trans.cpp b/paddle/fluid/lite/arm/math/type_trans.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f9c3ea590f394d226bee675ae793097b7afa031d --- /dev/null +++ b/paddle/fluid/lite/arm/math/type_trans.cpp @@ -0,0 +1,579 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/arm/math/type_trans.h" +#include +#include +#include "paddle/fluid/lite/arm/math/saturate.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +template +void int32_to_dtype(const int* din, dtype* dout, const float* scale, + int axis_size, int64_t outer_size, int64_t inner_size); + +void fp32_to_int8(const float* din, signed char* dout, const float* scale, + int axis_size, int64_t outer_size, int64_t inner_size) { + int cnt = inner_size / 16; + int remain = inner_size & 15; + int64_t loop_size = outer_size * axis_size; + +#pragma omp parallel for + for (int j = 0; j < loop_size; ++j) { + float inv_scale = 1.f / scale[j % axis_size]; + float32x4_t vzero = vdupq_n_f32(0.f); + float32x4_t vscale = vdupq_n_f32(inv_scale); + float32x4_t vpoff = vdupq_n_f32(0.5f); + float32x4_t vnoff = vdupq_n_f32(-0.5f); + const float* din_c = din + j * inner_size; + signed char* dout_c = dout + j * inner_size; + if (cnt > 0) { + int cnt_loop = cnt; + const float* din_ptr = din_c; + signed char* dout_ptr = dout_c; +#ifdef __aarch64__ + asm volatile( + "ldp q0, q1, [%[in]], #32 \n" + "ldp q2, q3, [%[in]], #32 \n" + "0: \n" /* main loop */ + "fmul v4.4s, v0.4s, %[scale].4s \n" + "fmul v5.4s, v1.4s, %[scale].4s \n" + "fmul v6.4s, v2.4s, %[scale].4s \n" + "fmul v7.4s, v3.4s, %[scale].4s \n" + "ldp q0, q1, [%[in]], #32 \n" + "subs %[cnt], %[cnt], #1 \n" + "FCVTAS v8.4s, v4.4s \n" + "FCVTAS v9.4s, v5.4s \n" + "FCVTAS v10.4s, v6.4s \n" + "FCVTAS v11.4s, v7.4s \n" + "ldp q2, q3, [%[in]], #32 \n" + "sqxtn v4.4h, v8.4s \n" + "sqxtn2 v4.8h, v9.4s \n" + "sqxtn v5.4h, v10.4s \n" + "sqxtn2 v5.8h, v11.4s \n" + "sqxtn v8.8b, v4.8h \n" + "sqxtn2 v8.16b, v5.8h \n" + "str q8, [%[out]], #16 \n" + "bne 0b \n" + : [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop) + : [scale] "w"(vscale) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", + "v11"); +#else + asm volatile( + "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" + "vld1.32 {d4-d7}, [%[din]]! 
@ load in8~in16\n" + "0: @ main loop\n" + "vand.i32 q4, %q[vpoff], %q[vpoff] @ set offset, 0.5\n" + "vand.i32 q5, q4, q4 @ set offset, 0.5\n" + "vand.i32 q6, q4, q4 @ set offset, 0.5\n" + "vand.i32 q7, q4, q4 @ set offset, 0.5\n" + "vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0\n" + "vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1\n" + "vcgt.f32 q10, q2, %q[vzero] @ get mask > 0, in2\n" + "vcgt.f32 q11, q3, %q[vzero] @ get mask > 0, in3\n" + "vbif.f32 q4, %q[vnoff], q8 @ get right offset\n" + "vbif.f32 q5, %q[vnoff], q9 @ get right offset\n" + "vbif.f32 q6, %q[vnoff], q10 @ get right offset\n" + "vbif.f32 q7, %q[vnoff], q11 @ get right offset\n" + "vmla.f32 q4, q0, %q[vscale] @ mul scale\n" + "vmla.f32 q5, q1, %q[vscale] @ mul scale\n" + "vmla.f32 q6, q2, %q[vscale] @ mul scale\n" + "vmla.f32 q7, q3, %q[vscale] @ mul scale\n" + "vcvt.s32.f32 q0, q4 @ cvt to int32\n" + "vcvt.s32.f32 q1, q5 @ cvt to int32\n" + "vcvt.s32.f32 q2, q6 @ cvt to int32\n" + "vcvt.s32.f32 q3, q7 @ cvt to int32\n" + "vqmovn.s32 d8, q0 @ cnt to int16\n" + "vqmovn.s32 d9, q1 @ cnt to int16\n" + "vqmovn.s32 d10, q2 @ cnt to int16\n" + "vqmovn.s32 d11, q3 @ cnt to int16\n" + "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" + "vqmovn.s16 d12, q4 @ cnt to int8\n" + "vqmovn.s16 d13, q5 @ cnt to int8\n" + "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" + "vst1.32 {d12-d13}, [%[dout]]! @ write to output\n" + "subs %[cnt], #1 @ loop count -1\n" + "bne 0b @ to main loop\n" + + : [dout] "+r"(dout_ptr), [din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) + : [vscale] "w"(vscale), [vpoff] "w"(vpoff), [vnoff] "w"(vnoff), + [vzero] "w"(vzero) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", + "q11"); +#endif + } + const float* din_r = din_c + 16 * cnt; + signed char* dout_r = dout_c + 16 * cnt; + for (int i = 0; i < remain; ++i) { + dout_r[i] = saturate_cast(roundf(inv_scale * din_r[i])); + } + } +} + +void fp32_to_int16(const float* din, int16_t* dout, const float* scale, + int axis_size, int64_t outer_size, int64_t inner_size) { + int cnt = inner_size / 8; + int remain = inner_size & 7; + int64_t loop_size = outer_size * axis_size; + +#pragma omp parallel for + for (int j = 0; j < loop_size; ++j) { + float inv_scale = 1.f / scale[j % axis_size]; + float32x4_t vzero = vdupq_n_f32(0.f); + float32x4_t vscale = vdupq_n_f32(inv_scale); + float32x4_t vpoff = vdupq_n_f32(0.5f); + float32x4_t vnoff = vdupq_n_f32(-0.5f); + const float* din_c = din + j * inner_size; + int16_t* dout_c = dout + j * inner_size; + if (cnt > 0) { + int cnt_loop = cnt; + const float* din_ptr = din_c; + int16_t* dout_ptr = dout_c; +#ifdef __aarch64__ + asm volatile( + "ldp q0, q1, [%[in]], #32 \n" + "0: \n" /* main loop */ + "fmul v4.4s, v0.4s, %[scale].4s \n" + "fmul v5.4s, v1.4s, %[scale].4s \n" + "ldp q0, q1, [%[in]], #32 \n" + "subs %[cnt], %[cnt], #1 \n" + "FCVTAS v8.4s, v4.4s \n" + "FCVTAS v9.4s, v5.4s \n" + "sqxtn v4.4h, v8.4s \n" + "sqxtn2 v4.8h, v9.4s \n" + "str q4, [%[out]], #16 \n" + "bne 0b \n" + : [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop) + : [scale] "w"(vscale) + : "v0", "v1", "v4", "v5", "v8", "v9"); +#else + asm volatile( + "vld1.32 {d0-d3}, [%[din]]! 
@ load in0~in7\n" + "0: @ main loop\n" + "vand.i32 q4, %q[vpoff], %q[vpoff] @ set offset, 0.5\n" + "vand.i32 q5, q4, q4 @ set offset, 0.5\n" + "vand.i32 q6, q4, q4 @ set offset, 0.5\n" + "vand.i32 q7, q4, q4 @ set offset, 0.5\n" + "vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0\n" + "vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1\n" + "vbif.f32 q4, %q[vnoff], q8 @ get right offset\n" + "vbif.f32 q5, %q[vnoff], q9 @ get right offset\n" + "vmla.f32 q4, q0, %q[vscale] @ mul scale\n" + "vmla.f32 q5, q1, %q[vscale] @ mul scale\n" + "vcvt.s32.f32 q0, q4 @ cvt to int32\n" + "vcvt.s32.f32 q1, q5 @ cvt to int32\n" + "vqmovn.s32 d8, q0 @ cnt to int16\n" + "vqmovn.s32 d9, q1 @ cnt to int16\n" + "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" + "vst1.32 {d8-d9}, [%[dout]]! @ write to output\n" + "subs %[cnt], #1 @ loop count -1\n" + "bne 0b @ to main loop\n" + + : [dout] "+r"(dout_ptr), [din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) + : [vscale] "w"(vscale), [vpoff] "w"(vpoff), [vnoff] "w"(vnoff), + [vzero] "w"(vzero) + : "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9"); +#endif + } + const float* din_r = din_c + 8 * cnt; + int16_t* dout_r = dout_c + 8 * cnt; + for (int i = 0; i < remain; ++i) { + dout_r[i] = saturate_cast(roundf(inv_scale * din_r[i])); + } + } +} + +void int8_to_fp32(const signed char* in, float* out, const float* scale, + int axis_size, int64_t outer_size, int64_t inner_size) { + int cnt = inner_size / 16; + int remain = inner_size & 15; + int64_t loop_size = axis_size * outer_size; +#pragma omp parallel for + for (int64_t n = 0; n < loop_size; ++n) { + float in_scale = scale[n % axis_size]; + const signed char* din_c = in + n * inner_size; + float* dout_c = out + n * inner_size; + float32x4_t vscale = vdupq_n_f32(in_scale); + if (cnt > 0) { + int loop = cnt; + const signed char* din_ptr = din_c; + float* dout_ptr = dout_c; +#ifdef __aarch64__ + asm volatile( + "ldp d0, d1, [%[in]], #16 \n" /* load 16 int8*/ + "0: \n" /* main loop */ + "sshll v2.8h, v0.8b, #0 \n" /* trans to int16*/ + "sshll v3.8h, v1.8b, #0 \n" /* trans to int16*/ + + "sshll v4.4s, v2.4h, #0 \n" /* trans to int32*/ + "sshll2 v5.4s, v2.8h, #0 \n" /* trans to int32*/ + "sshll v6.4s, v3.4h, #0 \n" /* trans to int32*/ + "sshll2 v7.4s, v3.8h, #0 \n" /* trans to int32*/ + + "ldp d0, d1, [%[in]], #16 \n" /* load 16 int8*/ + + "scvtf v8.4s, v4.4s \n" /* trans to fp32*/ + "scvtf v9.4s, v5.4s \n" /* trans to fp32*/ + "scvtf v10.4s, v6.4s \n" /* trans to fp32*/ + "scvtf v11.4s, v7.4s \n" /* trans to fp32*/ + + "subs %[loop], %[loop], #1 \n" + + "fmul v4.4s, v8.4s, %[scale].4s \n" /* mul with scale*/ + "fmul v5.4s, v9.4s, %[scale].4s \n" /* mul with scale*/ + "fmul v6.4s, v10.4s, %[scale].4s \n" /* mul with scale*/ + "fmul v7.4s, v11.4s, %[scale].4s \n" /* mul with scale*/ + + "stp q4, q5, [%[out]], #32 \n" /* write to memory*/ + "stp q6, q7, [%[out]], #32 \n" /* write to memory*/ + + "bne 0b \n" + : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) + : [scale] "w"(vscale) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", + "v11"); +#else + asm volatile( + "vld1.32 {d0-d1}, [%[in]]! 
@ load 16 int8\n" + "0: @ main loop\n" + "vmovl.s8 q2, d0 @ trans to int16\n" + "vmovl.s8 q3, d1 @ trans to int16\n" + "vmovl.s16 q4, d4 @ trans to int32\n" + "vmovl.s16 q5, d5 @ trans to int32\n" + "vmovl.s16 q6, d6 @ trans to int32\n" + "vmovl.s16 q7, d7 @ trans to int32\n" + "vcvt.f32.s32 q0, q4 @ trans to fp32\n" + "vcvt.f32.s32 q1, q5 @ trans to fp32\n" + "vcvt.f32.s32 q2, q6 @ trans to fp32\n" + "vcvt.f32.s32 q3, q7 @ trans to fp32\n" + "vmul.f32 q4, q0, %q[scale] @ mul with scale\n" + "vmul.f32 q5, q1, %q[scale] @ mul with scale\n" + "vmul.f32 q6, q2, %q[scale] @ mul with scale\n" + "vmul.f32 q7, q3, %q[scale] @ mul with scale\n" + + "vld1.32 {d0-d1}, [%[in]]! @ load 16 int8\n" + + "subs %[loop], #1 \n" + + "vst1.f32 {d8-d11}, [%[out]]! @ write to memory\n" + "vst1.f32 {d12-d15}, [%[out]]! @ write to memory\n" + + "bne 0b \n" + : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) + : [scale] "w"(vscale) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); +#endif // __aarch64__ + } + const signed char* din_r = din_c + 16 * cnt; + float* dout_r = dout_c + 16 * cnt; + for (int i = 0; i < remain; ++i) { + dout_r[i] = in_scale * din_r[i]; + } + } +} + +void int16_to_fp32(const int16_t* in, float* out, const float* scale, + int axis_size, int64_t outer_size, int64_t inner_size) { + int cnt = inner_size / 16; + int remain = inner_size & 15; + int64_t loop_size = axis_size * outer_size; +#pragma omp parallel for + for (int64_t n = 0; n < loop_size; ++n) { + float in_scale = scale[n % axis_size]; + const int16_t* din_c = in + n * inner_size; + float* dout_c = out + n * inner_size; + float32x4_t vscale = vdupq_n_f32(in_scale); + if (cnt > 0) { + int loop = cnt; + const int16_t* din_ptr = din_c; + float* dout_ptr = dout_c; +#ifdef __aarch64__ + asm volatile( + "ldp q0, q1, [%[in]], #32 \n" /* load 16 int16*/ + "0: \n" /* main loop */ + "sshll v4.4s, v0.4h, #0 \n" /* trans to int32*/ + "sshll2 v5.4s, v0.8h, #0 \n" /* trans to int32*/ + "sshll v6.4s, v1.4h, #0 \n" /* trans to int32*/ + "sshll2 v7.4s, v1.8h, #0 \n" /* trans to int32*/ + + "ldp q0, q1, [%[in]], #32 \n" /* load 16 int16*/ + + "scvtf v8.4s, v4.4s \n" /* trans to fp32*/ + "scvtf v9.4s, v5.4s \n" /* trans to fp32*/ + "scvtf v10.4s, v6.4s \n" /* trans to fp32*/ + "scvtf v11.4s, v7.4s \n" /* trans to fp32*/ + + "subs %[loop], %[loop], #1 \n" + + "fmul v4.4s, v8.4s, %[scale].4s \n" /* mul with scale*/ + "fmul v5.4s, v9.4s, %[scale].4s \n" /* mul with scale*/ + "fmul v6.4s, v10.4s, %[scale].4s \n" /* mul with scale*/ + "fmul v7.4s, v11.4s, %[scale].4s \n" /* mul with scale*/ + + "stp q4, q5, [%[out]], #32 \n" /* write to memory*/ + "stp q6, q7, [%[out]], #32 \n" /* write to memory*/ + + "bne 0b \n" + : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) + : [scale] "w"(vscale) + : "v0", "v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"); +#else + asm volatile( + "vld1.32 {d0-d3}, [%[in]]! @ load 16 int16\n" + "0: @ main loop\n" + "vmovl.s16 q4, d0 @ trans to int32\n" + "vmovl.s16 q5, d1 @ trans to int32\n" + "vmovl.s16 q6, d2 @ trans to int32\n" + "vmovl.s16 q7, d3 @ trans to int32\n" + "vcvt.f32.s32 q0, q4 @ trans to fp32\n" + "vcvt.f32.s32 q1, q5 @ trans to fp32\n" + "vcvt.f32.s32 q2, q6 @ trans to fp32\n" + "vcvt.f32.s32 q3, q7 @ trans to fp32\n" + "vmul.f32 q4, q0, %q[scale] @ mul with scale\n" + "vmul.f32 q5, q1, %q[scale] @ mul with scale\n" + "vmul.f32 q6, q2, %q[scale] @ mul with scale\n" + "vmul.f32 q7, q3, %q[scale] @ mul with scale\n" + + "vld1.32 {d0-d3}, [%[in]]! 
@ load 16 int8\n" + + "subs %[loop], #1 \n" + + "vst1.f32 {d8-d11}, [%[out]]! @ write to memory\n" + "vst1.f32 {d12-d15}, [%[out]]! @ write to memory\n" + + "bne 0b \n" + : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) + : [scale] "w"(vscale) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); +#endif // __aarch64__ + } + const int16_t* din_r = din_c + 16 * cnt; + float* dout_r = dout_c + 16 * cnt; + for (int i = 0; i < remain; ++i) { + dout_r[i] = in_scale * din_r[i]; + } + } +} + +void int32_to_fp32(const int* din, float* dout, const float* scale, + int axis_size, int64_t outer_size, int64_t inner_size) { + int cnt = inner_size / 16; + int remain = inner_size & 15; + int64_t loop_size = axis_size * outer_size; +#pragma omp parallel for + for (int64_t n = 0; n < loop_size; ++n) { + float in_scale = scale[n % axis_size]; + const int* din_c = din + n * inner_size; + float* dout_c = dout + n * inner_size; + float32x4_t vscale = vdupq_n_f32(in_scale); + if (cnt > 0) { + int loop = cnt; + const int* din_ptr = din_c; + float* dout_ptr = dout_c; +#ifdef __aarch64__ + asm volatile( + "ldp q0, q1, [%[in]], #32 \n" + "ldp q2, q3, [%[in]], #32 \n" + "0: \n" + "scvtf v4.4s, v0.4s \n" + "scvtf v5.4s, v1.4s \n" + "scvtf v6.4s, v2.4s \n" + "scvtf v7.4s, v3.4s \n" + "ldp q0, q1, [%[in]], #32 \n" + "fmul v8.4s, v4.4s, %[scale].4s \n" + "fmul v9.4s, v5.4s, %[scale].4s \n" + "fmul v10.4s, v6.4s, %[scale].4s \n" + "fmul v11.4s, v7.4s, %[scale].4s \n" + "ldp q2, q3, [%[in]], #32 \n" + "stp q8, q9, [%[out]], #32 \n" + "stp q10, q11, [%[out]], #32 \n" + "subs %[loop], %[loop], #1 \n" + "bne 0b \n" + : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) + : [scale] "w"(vscale) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", + "v11"); +#else + asm volatile( + "vld1.s32 {d0-d3}, [%[in]]! \n" + "vld1.s32 {d4-d7}, [%[in]]! \n" + "0: \n" + "vcvt.f32.s32 q4, q0 \n" + "vcvt.f32.s32 q5, q1 \n" + "vcvt.f32.s32 q6, q2 \n" + "vcvt.f32.s32 q7, q3 \n" + "vld1.s32 {d0-d3}, [%[in]]! \n" + "vmul.f32 q8, q4, %q[scale] \n" + "vmul.f32 q9, q5, %q[scale] \n" + "vmul.f32 q10, q6, %q[scale] \n" + "vmul.f32 q11, q7, %q[scale] \n" + "vld1.s32 {d4-d7}, [%[in]]! \n" + "subs %[loop], #1 \n" + "vst1.f32 {d16-d19}, [%[out]]! \n" + "vst1.f32 {d20-d23}, [%[out]]! 
\n" + "bne 0b \n" + : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) + : [scale] "w"(vscale) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", + "q11"); +#endif // __aarch64__ + } + const int* din_r = din_c + 16 * cnt; + float* dout_r = dout_c + 16 * cnt; + for (int i = 0; i < remain; ++i) { + dout_r[i] = in_scale * din_r[i]; + } + } +} + +void int32_to_int8(const int* din, signed char* dout, const float* scale, + int axis_size, int64_t outer_size, int64_t inner_size) { + int cnt = inner_size / 16; + int remain = inner_size & 15; + int64_t loop_size = outer_size * axis_size; +#pragma omp parallel for + for (int64_t n = 0; n < loop_size; ++n) { + float in_scale = scale[n % axis_size]; + const int* din_c = din + n * inner_size; + signed char* dout_c = dout + n * inner_size; + float32x4_t vscale = vdupq_n_f32(in_scale); + float32x4_t vzero = vdupq_n_f32(0.f); + float32x4_t vpoff = vdupq_n_f32(0.5f); + float32x4_t vnoff = vdupq_n_f32(-0.5f); + if (cnt > 0) { + int loop = cnt; + const int* din_ptr = din_c; + signed char* dout_ptr = dout_c; +#ifdef __aarch64__ + asm volatile( + "0: \n" + "ld1 {v0.4s, v1.4s}, [%[in]], #32 \n" + "ld1 {v2.4s, v3.4s}, [%[in]], #32 \n" + + "scvtf v4.4s, v0.4s \n" + "scvtf v5.4s, v1.4s \n" + "scvtf v6.4s, v2.4s \n" + "scvtf v7.4s, v3.4s \n" + + "fmul v0.4s, v4.4s, %[scale].4s \n" + "fmul v1.4s, v5.4s, %[scale].4s \n" + "fmul v2.4s, v6.4s, %[scale].4s \n" + "fmul v3.4s, v7.4s, %[scale].4s \n" + + "fcvtas v4.4s, v0.4s \n" + "fcvtas v5.4s, v1.4s \n" + "fcvtas v6.4s, v2.4s \n" + "fcvtas v7.4s, v3.4s \n" + + "sqxtn v0.4h, v4.4s \n" + "sqxtn2 v0.8h, v5.4s \n" + "sqxtn v1.4h, v6.4s \n" + "sqxtn2 v1.8h, v7.4s \n" + + "sqxtn v2.8b, v0.8h \n" + "sqxtn2 v2.16b, v1.8h \n" + + "st1 {v2.16b}, [%[out]], #16 \n" + "subs %[loop], %[loop], #1 \n" + "bne 0b \n" + : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) + : [scale] "w"(vscale) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +#else + asm volatile( + "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" + "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" + "0: @ main loop\n" + "vcvt.f32.s32 q4, q0 @ cvt to float\n" + "vcvt.f32.s32 q5, q1 @ cvt to float\n" + "vcvt.f32.s32 q6, q2 @ cvt to float\n" + "vcvt.f32.s32 q7, q3 @ cvt to float\n" + "vand.i32 q0, %q[vpoff], %q[vpoff] @ set offset, 0.5\n" + "vand.i32 q1, q0, q0 @ set offset, 0.5\n" + "vand.i32 q2, q0, q0 @ set offset, 0.5\n" + "vand.i32 q3, q0, q0 @ set offset, 0.5\n" + "vcgt.f32 q8, q4, %q[vzero] @ get mask > 0, in0\n" + "vcgt.f32 q9, q5, %q[vzero] @ get mask > 0, in1\n" + "vcgt.f32 q10, q6, %q[vzero] @ get mask > 0, in2\n" + "vcgt.f32 q11, q7, %q[vzero] @ get mask > 0, in3\n" + "vbif.f32 q0, %q[vnoff], q8 @ get right offset\n" + "vbif.f32 q1, %q[vnoff], q9 @ get right offset\n" + "vbif.f32 q2, %q[vnoff], q10 @ get right offset\n" + "vbif.f32 q3, %q[vnoff], q11 @ get right offset\n" + "vmla.f32 q0, q4, %q[vscale] @ mul scale\n" + "vmla.f32 q1, q5, %q[vscale] @ mul scale\n" + "vmla.f32 q2, q6, %q[vscale] @ mul scale\n" + "vmla.f32 q3, q7, %q[vscale] @ mul scale\n" + "vcvt.s32.f32 q4, q0 @ cvt to int32\n" + "vcvt.s32.f32 q5, q1 @ cvt to int32\n" + "vcvt.s32.f32 q6, q2 @ cvt to int32\n" + "vcvt.s32.f32 q7, q3 @ cvt to int32\n" + "vqmovn.s32 d16, q4 @ cnt to int16\n" + "vqmovn.s32 d17, q5 @ cnt to int16\n" + "vqmovn.s32 d18, q6 @ cnt to int16\n" + "vqmovn.s32 d19, q7 @ cnt to int16\n" + "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" + "vqmovn.s16 d8, q8 @ cnt to int8\n" + "vqmovn.s16 d9, q9 @ cnt to int8\n" + "vld1.32 {d4-d7}, [%[din]]! 
@ load in8~in16\n" + "vst1.32 {d8-d9}, [%[dout]]! @ write to output\n" + "subs %[loop], #1 @ loop count -1\n" + "bne 0b @ to main loop\n" + : [loop] "+r"(loop), [din] "+r"(din_ptr), [dout] "+r"(dout_ptr) + : [vscale] "w"(vscale), [vzero] "w"(vzero), [vnoff] "w"(vnoff), + [vpoff] "w"(vpoff) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", + "q11"); +#endif // __aarch64__ + } + const int* din_r = din_c + 16 * cnt; + int8_t* dout_r = dout_c + 16 * cnt; + for (int i = 0; i < remain; ++i) { + dout_r[i] = saturate_cast(roundf(in_scale * din_r[i])); + } + } +} + +void int32_to_int32(const int* din, int* dout, const float* scale, + int axis_size, int64_t outer_size, int64_t inner_size) { + int size_all = outer_size * axis_size * inner_size; + memmove(dout, din, size_all * sizeof(int)); +} + +template <> +void int32_to_dtype(const int* din, float* dout, const float* scale, + int axis_size, int64_t outer_size, int64_t inner_size) { + return int32_to_fp32(din, dout, scale, axis_size, outer_size, inner_size); +} + +template <> +void int32_to_dtype(const int* din, signed char* dout, const float* scale, + int axis_size, int64_t outer_size, int64_t inner_size) { + return int32_to_int8(din, dout, scale, axis_size, outer_size, inner_size); +} + +template <> +void int32_to_dtype(const int* din, int* dout, const float* scale, + int axis_size, int64_t outer_size, int64_t inner_size) { + return int32_to_int32(din, dout, scale, axis_size, outer_size, inner_size); +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/core/CMakeLists.txt b/paddle/fluid/lite/core/CMakeLists.txt index d3ec7df1119e4e13b689e663efdae7dbe30a9f8c..227216990fc3af39529c40ffc14d06339ca20047 100644 --- a/paddle/fluid/lite/core/CMakeLists.txt +++ b/paddle/fluid/lite/core/CMakeLists.txt @@ -57,3 +57,4 @@ lite_cc_test(test_type_system SRCS type_system_test.cc DEPS type_system utils_li lite_cc_test(test_types_lite SRCS types_test.cc DEPS types_lite) lite_cc_test(test_memory_lite SRCS memory_test.cc DEPS memory_lite) lite_cc_test(test_context_lite SRCS context_test.cc DEPS context_lite X86_DEPS operator) + diff --git a/paddle/fluid/lite/core/cpu_info.cc b/paddle/fluid/lite/core/cpu_info.cc index df80f1c857688fd6fb76350e720effef0f3c15f6..ab1968295813006d5d11fc4fbf416b4f9c3a3215 100644 --- a/paddle/fluid/lite/core/cpu_info.cc +++ b/paddle/fluid/lite/core/cpu_info.cc @@ -54,15 +54,15 @@ void DeviceInfo::InitInternal(DeviceInfo* dev) { << ", cluster ID: " << dev->cluster_ids_[dev->core_ids_[i]] << ", CPU ARCH: A" << dev->archs_[i]; } - LOG(INFO) << "L1 DataCache size is: "; + VLOG(1) << "L1 DataCache size is: "; for (int i = 0; i < dev->compute_core_num_; ++i) { - LOG(INFO) << dev->L1_cache_[i] / 1024 << " KB"; + VLOG(1) << dev->L1_cache_[i] / 1024 << " KB"; } - LOG(INFO) << "L2 Cache size is: "; + VLOG(1) << "L2 Cache size is: "; for (int i = 0; i < dev->compute_core_num_; ++i) { - LOG(INFO) << dev->L2_cache_[i] / 1024 << " KB"; + VLOG(1) << dev->L2_cache_[i] / 1024 << " KB"; } - LOG(INFO) << "Total memory: " << dev->max_memory_ << "KB"; + VLOG(1) << "Total memory: " << dev->max_memory_ << "KB"; dev->max_freq_ = max_freq[0]; for (int j = 1; j < dev->compute_core_num_; ++j) { diff --git a/paddle/fluid/lite/core/hvy_tensor.h b/paddle/fluid/lite/core/hvy_tensor.h index 16172a80035e6512244f0bccd91ff2f5d2553f0d..748e80c2559718d278a08e3c568532e177c835eb 100644 --- a/paddle/fluid/lite/core/hvy_tensor.h +++ b/paddle/fluid/lite/core/hvy_tensor.h @@ -107,6 +107,8 @@ 
class TensorHvy : public TensorBase { data_.Resize(framework::make_ddim(dims.Vectorize())); } + void Resize(const std::vector& x) { Resize(DDimHvy(x)); } + void ShareDataWith(const TensorHvy& other) { data_.ShareDataWith(other.data_); } diff --git a/paddle/fluid/lite/core/memory.h b/paddle/fluid/lite/core/memory.h index 5948f6c4a854d9f678c316f351c017788c44c4a2..6b019abc19d4e0e0add32b23d3f39820b8b47588 100644 --- a/paddle/fluid/lite/core/memory.h +++ b/paddle/fluid/lite/core/memory.h @@ -65,6 +65,8 @@ class Buffer { TargetCopy(target_, data_, other.data_, nbytes); } + ~Buffer() { Free(); } + private: // memory it actually malloced. size_t space_{0}; diff --git a/paddle/fluid/lite/core/mir/CMakeLists.txt b/paddle/fluid/lite/core/mir/CMakeLists.txt index 86303976e1248583e7862634e1d9b2ca0426c771..c3d3df9c6778eee53bf6492f4c4bfae36ae80687 100644 --- a/paddle/fluid/lite/core/mir/CMakeLists.txt +++ b/paddle/fluid/lite/core/mir/CMakeLists.txt @@ -59,3 +59,4 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) pattern_matcher_high_api proto_desc mir_pass_manager fc_op_lite mul_op_lite elementwise_ops_lite mir_passes compatible_pb_lite program_lite ${ops_lite}) endif() + diff --git a/paddle/fluid/lite/core/mir/pattern_matcher_tester.cc b/paddle/fluid/lite/core/mir/pattern_matcher_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..3b082060fe21731000394f6941e0803af7da74d6 --- /dev/null +++ b/paddle/fluid/lite/core/mir/pattern_matcher_tester.cc @@ -0,0 +1,233 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
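+//
+// The tests below build a small SSA graph of five statement nodes (op1..op5)
+// and four argument nodes (var1..var4) wired as
+//   op1 -> var1 -> op2,   op2 -> var2 -> {op3, op4},
+//   op2 -> var3 -> op5,   op3 -> var4 -> op5
+// and then exercise PMPattern node/edge construction, marking of pattern
+// nodes in the graph, multi-subgraph detection with overlap resolution, and
+// the AsIntermediate() constraint (an intermediate var must have no other
+// consumers for the match to succeed).
+//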
+ +#include "paddle/fluid/lite/core/mir/pattern_matcher.h" + +#include + +namespace paddle { +namespace lite { +namespace mir { + +void BuildGraph(SSAGraph* g) { + g->mutable_nodes().emplace_back(); + Node& o1 = g->mutable_nodes().back(); + o1.AsStmt().op_type = "op1"; + g->mutable_nodes().emplace_back(); + Node& o2 = g->mutable_nodes().back(); + o2.AsStmt().op_type = "op2"; + g->mutable_nodes().emplace_back(); + Node& o3 = g->mutable_nodes().back(); + o3.AsStmt().op_type = "op3"; + g->mutable_nodes().emplace_back(); + Node& o4 = g->mutable_nodes().back(); + o4.AsStmt().op_type = "op4"; + g->mutable_nodes().emplace_back(); + Node& o5 = g->mutable_nodes().back(); + o5.AsStmt().op_type = "op5"; + g->mutable_nodes().emplace_back(); + Node& v1 = g->mutable_nodes().back(); + v1.AsArg("var1"); + g->mutable_nodes().emplace_back(); + Node& v2 = g->mutable_nodes().back(); + v2.AsArg("var2"); + g->mutable_nodes().emplace_back(); + Node& v3 = g->mutable_nodes().back(); + v3.AsArg("var3"); + g->mutable_nodes().emplace_back(); + Node& v4 = g->mutable_nodes().back(); + v4.AsArg("var4"); + + // o1->v1->o2 + o1.outlinks.push_back(&v1); + o2.inlinks.push_back(&v1); + v1.inlinks.push_back(&o1); + v1.outlinks.push_back(&o2); + // o2->v2->o3 + // o2->v2->o4 + o2.outlinks.push_back(&v2); + o3.inlinks.push_back(&v2); + o4.inlinks.push_back(&v2); + v2.inlinks.push_back(&o2); + v2.outlinks.push_back(&o3); + v2.outlinks.push_back(&o4); + // o2->v3->o5 + o2.outlinks.push_back(&v3); + o5.inlinks.push_back(&v3); + v3.inlinks.push_back(&o2); + v3.outlinks.push_back(&o5); + // o3-v4->o5 + o3.outlinks.push_back(&v4); + o5.inlinks.push_back(&v4); + v4.inlinks.push_back(&o3); + v4.outlinks.push_back(&o5); +} + +TEST(PMPattern, NewNode) { + PMPattern x; + auto* n = x.NewNode([](const Node* x) { return true; }); + ASSERT_TRUE(n); + ASSERT_EQ(x.nodes_.size(), 1UL); +} + +TEST(PMPattern, AddEdge) { + PMPattern x; + auto* a = x.NewNode([](const Node* x) { return true; }); + auto* b = x.NewNode([](const Node* x) { return true; }); + ASSERT_TRUE(a); + ASSERT_TRUE(b); + x.AddEdge(a, b); + ASSERT_EQ(x.nodes_.size(), 2UL); + ASSERT_EQ(x.edges_.size(), 1UL); + ASSERT_EQ(x.edges_.front().first, a); + ASSERT_EQ(x.edges_.front().second, b); + + ASSERT_EQ(x.nodes().size(), 2UL); + ASSERT_EQ(x.edges().size(), 1UL); + ASSERT_EQ(x.edges().front().first, a); + ASSERT_EQ(x.edges().front().second, b); +} + +TEST(PatternMatcher, MarkPMNodesInGraph) { + PatternMatcher x; + // mark o2, o3, v2 + + // The pattern is a graph: + // o2(a node named o2) -> v2(a node named v2) + // v2 -> o3(a node named o3) + auto* o2 = x.pattern_.NewNode([](const Node* node) { + // The teller can be any condition, such as op type, or variable's shape. + return node && node->IsStmt() && node->stmt()->op_type == "op2"; + }); + auto* o3 = x.pattern_.NewNode([](const Node* node) { + // The teller can be any condition, such as op type, or variable's shape. + return node && node->IsStmt() && node->stmt()->op_type == "op3"; + }); + auto* v2 = x.pattern_.NewNode([](const Node* node) { + // The teller can be any condition, such as op type, or variable's shape. 
+ return node && node->IsArg() && node->arg()->name == "var2"; + }); + + ASSERT_FALSE(o2->Tell(nullptr)); + ASSERT_FALSE(o3->Tell(nullptr)); + ASSERT_FALSE(v2->Tell(nullptr)); + + x.pattern_.AddEdge(o2, v2); + x.pattern_.AddEdge(v2, o3); + + ASSERT_EQ(x.pattern_.edges().size(), 2UL); + ASSERT_EQ(x.pattern_.edges()[0].first, o2); + ASSERT_EQ(x.pattern_.edges()[0].second, v2); + ASSERT_EQ(x.pattern_.edges()[1].first, v2); + ASSERT_EQ(x.pattern_.edges()[1].second, o3); + + SSAGraph graph; + BuildGraph(&graph); + + x.MarkPMNodesInGraph(&graph); + + ASSERT_EQ(x.pmnodes2nodes_.size(), 3UL); + + auto subgraphs = x.DetectPatterns(); + ASSERT_EQ(subgraphs.size(), 1UL); +} + +TEST(PatternMatcher, MultiSubgraph) { + SSAGraph graph; + BuildGraph(&graph); + + PatternMatcher x; + + // The pattern is a graph: + // op -> var + auto* any_op = x.mutable_pattern()->NewNode( + [](const Node* node) { + return node->IsStmt() && (node->stmt()->op_type == "op2" || + node->stmt()->op_type == "op3"); + }, + "OP0"); + auto* any_var = + x.mutable_pattern() + ->NewNode([](const Node* node) { return node->IsArg(); }, "VAR") + ->AsIntermediate(); + auto* any_op1 = x.mutable_pattern()->NewNode( + [](const Node* node) { return node->IsStmt(); }, "OP1"); + + x.mutable_pattern()->AddEdge(any_op, any_var); + x.mutable_pattern()->AddEdge(any_var, any_op1); + + int count = 0; + PatternMatcher::handle_t handle = [&](const PatternMatcher::subgraph_t& s, + SSAGraph* g) { + LOG(INFO) << "Detect " << s.at(any_op)->stmt()->op_type << " -> " + << s.at(any_var)->arg()->name << " -> " + << s.at(any_op1)->stmt()->op_type; + count++; + }; + + x(&graph, handle); + + // 1. Detect op3 -> var4 -> op5 + // 2. Detect op2 -> var2 -> op3 + // 3. Detect op2 -> var2 -> op4 + // 4. Detect op2 -> var3 -> op5 + // But 2 and 3 and 4 overlapped, so keep 2, so the final choices are 1 and 2 + ASSERT_GE(count, 1); + ASSERT_LE(count, 2); +} + +TEST(PatternMatcher, IntermediateCheck) { + SSAGraph graph; + BuildGraph(&graph); + + // o2->v2->o3 + // o2->v2->o4 + // check o2+o3 fuse, should fail because v2 also link to o4. 
+ PatternMatcher matcher; + auto* op2 = matcher.mutable_pattern()->NewNode( + [](const Node* x) { + return x && x->IsStmt() && x->stmt()->op_type == "op2"; + }, + "op2"); + auto* op3 = matcher.mutable_pattern()->NewNode( + [](const Node* x) { + return x && x->IsStmt() && x->stmt()->op_type == "op3"; + }, + "op3"); + auto* v2 = matcher.mutable_pattern() + ->NewNode( + [](const Node* x) { + return x && x->IsArg() && x->arg()->name == "var2"; + }, + "var2") + ->AsIntermediate(); + v2->LinksFrom({op2}).LinksTo({op3}); + + int count = 0; + matcher(&graph, [&](const PatternMatcher::subgraph_t& g, SSAGraph* graph) { + ++count; + }); + EXPECT_EQ(count, 0); + + count = 0; + v2->AsInput(); + matcher(&graph, [&](const PatternMatcher::subgraph_t& g, SSAGraph* graph) { + ++count; + }); + ASSERT_EQ(count, 1); +} + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/core/op_registry.h b/paddle/fluid/lite/core/op_registry.h index 49332262deb6552c5c9079ff93d691f464ed7028..1052419ecda8bcad8d919c0d8f8e2ab3f969440f 100644 --- a/paddle/fluid/lite/core/op_registry.h +++ b/paddle/fluid/lite/core/op_registry.h @@ -91,9 +91,9 @@ class KernelRegistry final { void Register(const std::string &name, typename KernelRegistryForTarget::creator_t &&creator) { - // VLOG(3) << "register for " << TargetToStr(Target) << ":" - //<< PrecisionToStr(Precision) << "//" - //<< GetKernelOffset(); + VLOG(3) << "register for " << TargetToStr(Target) << ":" + << PrecisionToStr(Precision) << "//" + << GetKernelOffset(); using kernel_registor_t = KernelRegistryForTarget; auto &varient = registries_[GetKernelOffset()]; @@ -153,6 +153,9 @@ class KernelRegistor : public lite::Registor { public: KernelRegistor(const std::string &op_type, const std::string &alias) : Registor([=] { + VLOG(3) << "Register kernel " << op_type << " for " + << TargetToStr(target) << " " << PrecisionToStr(precision) + << " " << DataLayoutToStr(layout) << " alias " << alias; KernelRegistry::Global().Register( op_type, [=]() -> std::unique_ptr { std::unique_ptr x(new KernelType); diff --git a/paddle/fluid/lite/core/profile/CMakeLists.txt b/paddle/fluid/lite/core/profile/CMakeLists.txt index 43731e8a414cff29b9ac4c681e4e0fd67a52603a..92ac495b6b6b35fce710a3d522ae139e2ce54e0a 100644 --- a/paddle/fluid/lite/core/profile/CMakeLists.txt +++ b/paddle/fluid/lite/core/profile/CMakeLists.txt @@ -4,3 +4,4 @@ endif() lite_cc_library(basic_profiler_lite SRCS basic_profiler.cc) lite_cc_test(test_basic_profiler SRCS basic_profiler_test.cc DEPS basic_profiler_lite) + diff --git a/paddle/fluid/lite/core/tensor.h b/paddle/fluid/lite/core/tensor.h index d6980ff8898374a54393d0b3c2b9af995504e42a..27677e23a27366d052001a6828f12d1cfcc5decb 100644 --- a/paddle/fluid/lite/core/tensor.h +++ b/paddle/fluid/lite/core/tensor.h @@ -21,6 +21,7 @@ * looks the same. 
*/ +#include #include #include "paddle/fluid/lite/core/target_wrapper.h" diff --git a/paddle/fluid/lite/cuda/CMakeLists.txt b/paddle/fluid/lite/cuda/CMakeLists.txt index 505759c7d4afef95423ce3815912794ae28255b0..9889b8b1aa02b9f886bf45aaf9b997f0043c3278 100644 --- a/paddle/fluid/lite/cuda/CMakeLists.txt +++ b/paddle/fluid/lite/cuda/CMakeLists.txt @@ -4,3 +4,4 @@ endif() nv_library(target_wrapper_cuda SRCS target_wrapper.cc) nv_library(cuda_blas_lite SRCS blas.cc) + diff --git a/paddle/fluid/lite/gen_code/CMakeLists.txt b/paddle/fluid/lite/gen_code/CMakeLists.txt index bacfc3e988e6035dba696ac626da7a8072821b52..d6e447a2592856730136e8a80bd671ef52cd295c 100644 --- a/paddle/fluid/lite/gen_code/CMakeLists.txt +++ b/paddle/fluid/lite/gen_code/CMakeLists.txt @@ -18,10 +18,11 @@ if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) DEPS scope_lite op_lite kernel_lite paddle_infer_gencode ) - lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_code__ - ${ops_lite} ${host_kernels} - X86_DEPS ${x86_kernels} - ) + # lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_code__ + # ${ops_lite} ${host_kernels} + # X86_DEPS ${x86_kernels} + # ) - add_dependencies(__generated_code__ test_gen_code_lite) + # add_dependencies(__generated_code__ test_gen_code_lite) endif() + diff --git a/paddle/fluid/lite/host/CMakeLists.txt b/paddle/fluid/lite/host/CMakeLists.txt index 90812f3f3cd712571eb7f11261e23c8dcb78b0fe..7f7cf8b238f99fa9db5569952f9e0e39a8ef9f37 100644 --- a/paddle/fluid/lite/host/CMakeLists.txt +++ b/paddle/fluid/lite/host/CMakeLists.txt @@ -1 +1,2 @@ cc_library(target_wrapper_host SRCS target_wrapper.cc) + diff --git a/paddle/fluid/lite/kernels/CMakeLists.txt b/paddle/fluid/lite/kernels/CMakeLists.txt index ce22ba1216664cdf539ee4f576016adc389622ca..0d2178382d99debe1775bd015701825b0a06133a 100644 --- a/paddle/fluid/lite/kernels/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/CMakeLists.txt @@ -5,3 +5,4 @@ add_subdirectory(arm) add_subdirectory(cuda) add_subdirectory(x86) + diff --git a/paddle/fluid/lite/kernels/arm/CMakeLists.txt b/paddle/fluid/lite/kernels/arm/CMakeLists.txt index ff3cab02ee8b7e88783b8c6c18c496bf674c7cfd..6e4d73ecc6f65c5a5a09178680afe8a6ec7f8445 100644 --- a/paddle/fluid/lite/kernels/arm/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/arm/CMakeLists.txt @@ -6,15 +6,24 @@ message(STATUS "compile with lite ARM kernels") cc_library(fc_compute_arm SRCS fc_compute.cc DEPS ${lite_kernel_deps} math_arm) cc_library(relu_compute_arm SRCS relu_compute.cc DEPS ${lite_kernel_deps}) -cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} eigen3) +cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} math_arm) cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} math_arm) cc_library(softmax_compute_arm SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm) +cc_library(conv_compute_arm SRCS conv_compute.cc DEPS ${lite_kernel_deps} math_arm) +cc_library(batch_norm_compute_arm SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps} math_arm) cc_library(elementwise_add_compute_arm SRCS elementwise_add_compute.cc DEPS ${lite_kernel_deps} math_arm) +cc_library(pool_compute_arm SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_arm) +cc_library(split_compute_arm SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm) lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm math_arm) lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm) 
lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm) +lite_cc_test(test_conv_compute_arm SRCS conv_compute_test.cc DEPS conv_compute_arm) +lite_cc_test(test_batch_norm_compute_arm SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_arm) lite_cc_test(test_elementwise_add_compute_arm SRCS elementwise_add_compute_test.cc DEPS elementwise_add_compute_arm) +lite_cc_test(test_pool_compute_arm SRCS pool_compute_test.cc DEPS pool_compute_arm) +lite_cc_test(test_mul_compute_arm SRCS mul_compute_test.cc DEPS mul_compute_arm) +lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm) set(arm_kernels fc_compute_arm @@ -22,6 +31,13 @@ set(arm_kernels mul_compute_arm scale_compute_arm softmax_compute_arm - elementwise_add_compute_arm) + conv_compute_arm + batch_norm_compute_arm + elementwise_add_compute_arm + pool_compute_arm + split_compute_arm + ) set(arm_kernels "${arm_kernels}" CACHE INTERNAL "arm kernels") + + diff --git a/paddle/fluid/lite/kernels/arm/batch_norm_compute.cc b/paddle/fluid/lite/kernels/arm/batch_norm_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..0cb43dd5e0430092cb4e3edb13226ca30de61e4d --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/batch_norm_compute.cc @@ -0,0 +1,114 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
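+//
+// With global statistics (is_test || use_global_stats), batch normalization
+//   y = (x - mean) / sqrt(variance + epsilon) * scale + bias
+// is folded once in PrepareForRun() into a per-channel affine transform:
+//   new_scale[c] = scale[c] / sqrt(variance[c] + epsilon)
+//   new_bias[c]  = bias[c] - new_scale[c] * mean[c]
+// so that Run() only evaluates y = x * new_scale + new_bias through
+// lite::arm::math::scale() over the NCHW inner dimensions.
+//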
+ +#include "paddle/fluid/lite/kernels/arm/batch_norm_compute.h" +#include "paddle/fluid/lite/arm/math/funcs.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void BatchNormCompute::PrepareForRun() { + auto& param = this->Param(); + auto x_dims = param.x->dims(); + bool global_stats = param.is_test || param.use_global_stats; + if (global_stats) { + int64_t channel_size = 0; + switch (param.data_layout) { + case DATALAYOUT(kNCHW): + channel_size = x_dims[1]; + break; + // case DATALAYOUT(kNHWC): + // channel_size = x_dims[x_dims.size() - 1]; + // break; + default: + LOG(FATAL) << "Unknown storage order: " + << DataLayoutToStr(param.data_layout); + break; + } + new_scale.Resize({channel_size}); + new_bias.Resize({channel_size}); + auto* scale_data = param.scale->mutable_data(); + auto* bias_data = param.bias->mutable_data(); + auto* mean_data = param.mean->mutable_data(); + auto* variance_data = param.variance->mutable_data(); + auto* new_scale_data = new_scale.mutable_data(); + auto* new_bias_data = new_bias.mutable_data(); + for (int c = 0; c < channel_size; c++) { + float inv_scale = 1.f / (std::sqrt(variance_data[c] + param.epsilon)); + new_bias_data[c] = + bias_data[c] - inv_scale * scale_data[c] * mean_data[c]; + new_scale_data[c] = inv_scale * scale_data[c]; + } + } +} + +void BatchNormCompute::Run() { + auto& param = this->Param(); + auto x_dims = param.x->dims(); + auto x_data = param.x->mutable_data(); + auto y_data = param.y->mutable_data(); + bool global_stats = param.is_test || param.use_global_stats; + if (global_stats) { + auto* new_scale_data = new_scale.mutable_data(); + auto* new_bias_data = new_bias.mutable_data(); + int64_t outer_size = 0; + int64_t channel_size = 0; + int64_t inner_size = 0; + switch (param.data_layout) { + case DATALAYOUT(kNCHW): + outer_size = x_dims[0]; + channel_size = x_dims[1]; + inner_size = x_dims.Slice(2, x_dims.size()).production(); + lite::arm::math::scale(x_data, y_data, outer_size, channel_size, + inner_size, new_scale_data, new_bias_data); + break; + // case DATALAYOUT(kNHWC): + // outer_size = x_dims.Slice(0, x_dims.size() - 1).production(); + // channel_size = x_dims[x_dims.size() - 1]; + // lite::arm::math::scale(x_data, y_data, outer_size, channel_size, + // new_scale_data, new_bias_data); + // break; + default: + LOG(FATAL) << "Unknown storage order: " + << DataLayoutToStr(param.data_layout); + break; + } + } else { + // TODO(hong19860320) calculate mean_out, variance_out, saved_mean and + // saved_variance + } +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, + paddle::lite::kernels::arm::BatchNormCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Mean", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Variance", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("VarianceOut", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/paddle/fluid/lite/kernels/arm/batch_norm_compute.h 
b/paddle/fluid/lite/kernels/arm/batch_norm_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..cf3ad3accded0db9a95d0f0794c863b4f7b1cd8e --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/batch_norm_compute.h @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +class BatchNormCompute : public KernelLite { + public: + using param_t = operators::BatchNormParam; + + void PrepareForRun() override; + + void Run() override; + + virtual ~BatchNormCompute() = default; + + private: + Tensor new_scale; + Tensor new_bias; +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/arm/batch_norm_compute_test.cc b/paddle/fluid/lite/kernels/arm/batch_norm_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3ca1a0b599b3448fe2dbed08fb37ccc9dae3450c --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/batch_norm_compute_test.cc @@ -0,0 +1,221 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
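+//
+// batch_norm_compute_ref below is the scalar reference used to validate the
+// ARM kernel: for NCHW input it applies, per channel c,
+//   y = (x - mean[c]) / sqrt(variance[c] + epsilon) * scale[c] + bias[c]
+// while the test sweeps batch/channel/spatial sizes, epsilon and momentum,
+// comparing kernel and reference outputs within 1e-5.
+//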
+ +#include "paddle/fluid/lite/kernels/arm/batch_norm_compute.h" +#include +#include +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +template +void batch_norm_compute_ref(const operators::BatchNormParam& param) { + DDim x_dims = param.x->dims(); + auto x_data = param.x->mutable_data(); + auto scale_data = param.scale->mutable_data(); + auto bias_data = param.bias->mutable_data(); + auto mean_data = param.mean->mutable_data(); + auto variance_data = param.variance->mutable_data(); + auto y_data = param.y->mutable_data(); + float epsilon = param.epsilon; + float momentum = param.momentum; + DataLayoutType data_layout = param.data_layout; + + bool global_stats = param.is_test || param.use_global_stats; + if (global_stats) { + int64_t outer_size = 0; + int64_t channel_size = 0; + int64_t inner_size = 0; + switch (data_layout) { + case DATALAYOUT(kNCHW): + outer_size = x_dims[0]; + channel_size = x_dims[1]; + inner_size = x_dims.Slice(2, x_dims.size()).production(); + break; + // case DATALAYOUT(kNHWC): + // outer_size = x_dims.Slice(0, x_dims.size() - 1).production(); + // channel_size = x_dims[x_dims.size() - 1]; + // inner_size = 1; + // break; + default: + LOG(FATAL) << "Unknown storage order: " << DataLayoutToStr(data_layout); + break; + } + auto x_ptr = x_data; + auto y_ptr = y_data; + for (int o = 0; o < outer_size; o++) { + for (int c = 0; c < channel_size; c++) { + for (int i = 0; i < inner_size; i++) { + dtype norm_x = + (*x_ptr - mean_data[c]) / std::sqrt(variance_data[c] + epsilon); + *y_ptr = norm_x * scale_data[c] + bias_data[c]; + x_ptr++; + y_ptr++; + } + } + } + } else { + // TODO(hong19860320) calculate mean_out, variance_out, saved_mean and + // saved_variance + } +} + +TEST(batch_norm_arm, retrive_op) { + auto batch_norm = + KernelRegistry::Global().Create( + "batch_norm"); + ASSERT_FALSE(batch_norm.empty()); + ASSERT_TRUE(batch_norm.front()); +} + +TEST(batch_norm_arm, init) { + BatchNormCompute batch_norm; + ASSERT_EQ(batch_norm.precision(), PRECISION(kFloat)); + ASSERT_EQ(batch_norm.target(), TARGET(kARM)); +} + +TEST(batch_norm_arm, compute) { + DeviceInfo::Init(); + for (auto n : {1, 2}) { + for (auto c : {6, 32 /*, 128*/}) { + for (auto h : {9, 18 /*, 56 , 112, 224, 512*/}) { + for (auto w : {9, 18 /*, 56, 112, 224, 512*/}) { + for (auto is_test : {/*false, */ true}) { + for (auto use_global_stats : {false, true}) { + for (auto epsilon : {1e-4f, 1e-5f}) { + for (auto momentum : {0.9f, 0.99f}) { + for (auto data_layout : + {DATALAYOUT(kNCHW) /*, DATALAYOUT(kNHWC)*/}) { + Tensor x; + Tensor scale; + Tensor bias; + Tensor mean; + Tensor variance; + Tensor y; + Tensor mean_out; + Tensor variance_out; + Tensor saved_mean; + Tensor saved_variance; + Tensor y_ref; + Tensor mean_out_ref; + Tensor variance_out_ref; + Tensor saved_mean_ref; + Tensor saved_variance_ref; + // set the dims of input, output, ref output tensors + std::vector in_out_shape; + switch (data_layout) { + case DATALAYOUT(kNCHW): + in_out_shape = {n, c, h, w}; + break; + // case DATALAYOUT(kNHWC): + // in_out_shape = {n, h, w, c}; + // break; + default: + LOG(FATAL) << "Unknown storage order: " + << DataLayoutToStr(data_layout); + break; + } + x.Resize(in_out_shape); + scale.Resize({c}); + bias.Resize({c}); + mean.Resize({c}); + variance.Resize({c}); + y.Resize(in_out_shape); + mean_out.Resize({c}); + variance_out.Resize({c}); + saved_mean.Resize({c}); + saved_variance.Resize({c}); + 
y_ref.Resize(in_out_shape); + mean_out_ref.Resize({c}); + variance_out_ref.Resize({c}); + saved_mean_ref.Resize({c}); + saved_variance_ref.Resize({c}); + // initialize the data of input tensors + auto* x_data = x.mutable_data(); + auto* scale_data = scale.mutable_data(); + auto* bias_data = bias.mutable_data(); + auto* mean_data = mean.mutable_data(); + auto* variance_data = variance.mutable_data(); + auto* y_data = y.mutable_data(); + for (int i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i % 64); + } + for (int i = 0; i < scale.dims().production(); i++) { + scale_data[i] = static_cast(i) * 0.01f + 0.03f; + } + for (int i = 0; i < bias.dims().production(); i++) { + bias_data[i] = static_cast(i) * 0.065f + 0.1f; + } + for (int i = 0; i < mean.dims().production(); i++) { + mean_data[i] = static_cast(i) * 0.0565f; + } + for (int i = 0; i < variance.dims().production(); i++) { + variance_data[i] = static_cast(i) * 2.08f + 1.5f; + } + // prepare kernel params and run + BatchNormCompute batch_norm; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + batch_norm.SetContext(std::move(ctx)); + operators::BatchNormParam param; + param.x = &x; + param.scale = &scale; + param.bias = &bias; + param.mean = &mean; + param.variance = &variance; + param.is_test = is_test; + param.use_global_stats = use_global_stats; + param.epsilon = epsilon; + param.momentum = momentum; + param.data_layout = data_layout; + param.y = &y; + param.mean_out = &mean_out; + param.variance_out = &variance_out; + param.saved_mean = &saved_mean; + param.saved_variance = &saved_variance; + batch_norm.SetParam(param); + batch_norm.Launch(); + // invoking ref implementation and compare results + param.y = &y_ref; + param.mean_out = &mean_out_ref; + param.variance_out = &variance_out_ref; + param.saved_mean = &saved_mean_ref; + param.saved_variance = &saved_variance_ref; + batch_norm_compute_ref(param); + auto* y_ref_data = y_ref.mutable_data(); + for (int i = 0; i < y.dims().production(); i++) { + EXPECT_NEAR(y_data[i], y_ref_data[i], 1e-5); + } + } + } + } + } + } + } + } + } + } +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/arm/conv_compute.cc b/paddle/fluid/lite/kernels/arm/conv_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..0b464a5df0b0c33e76d2a31db183a515fea7a015 --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/conv_compute.cc @@ -0,0 +1,114 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
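+//
+// PrepareForRun() picks a concrete convolution implementation; a summary of
+// the dispatch below:
+//   - DepthwiseConv: groups == ic == oc, square kernel/stride/pad, no
+//     dilation, and either 3x3 (pad 0/1, stride 1/2) or 5x5 (s1, or s2 p2)
+//   - WinogradConv:  groups == 1, 3x3, stride 1, ic >= 32 && oc >= 32 and
+//     output larger than 16x16
+//   - DirectConv:    groups == 1, 3x3, stride 1 (smaller shapes) or stride 2
+//   - GemmLikeConv:  every other configuration
+//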
+ +#include "paddle/fluid/lite/kernels/arm/conv_compute.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void ConvCompute::PrepareForRun() { + auto& param = this->Param(); + auto x_dims = param.x->dims(); + auto w_dims = param.filter->dims(); + auto o_dims = param.output->dims(); + + auto& ctx = this->ctx_->template As(); + + int win = x_dims[3]; // nchw + int hin = x_dims[2]; + int ic = x_dims[1]; + int bs = x_dims[0]; + int ow = o_dims[3]; + int oh = o_dims[2]; + int oc = o_dims[1]; + int kh = w_dims[2]; // oihw + int kw = w_dims[3]; + int pad = param.paddings[0]; + int stride = param.strides[0]; + + const auto* i_data = param.x->data(); + const auto* w_data = param.filter->data(); + const auto* b_data = param.bias ? param.bias->data() : nullptr; + auto* o_data = param.output->mutable_data(); + + bool kps_equal = (param.paddings[0] == param.paddings[1]) && + (param.strides[0] == param.strides[1]) && (kw == kh); + bool no_dilation = (param.dilations[0] == 1) && (param.dilations[1] == 1); + bool flag_dw_3x3 = + (kw == 3 && (pad == 0 || pad == 1) && (stride == 1 || stride == 2)); + bool flag_dw_5x5 = + (kw == 5 && stride == 1) || (kw == 5 && stride == 2 && pad == 2); + bool flag_dw = flag_dw_3x3 || flag_dw_5x5; + + // select conv impl + if (param.groups == ic && ic == oc && kps_equal && no_dilation && flag_dw) { + // dw conv impl + impl_ = new lite::arm::math::DepthwiseConv; + VLOG(3) << "invoking dw conv"; + } else if (param.groups == 1 && kw == 3 && stride == 1 && kps_equal && + no_dilation) { + if (ic >= 32 && oc >= 32 && oh > 16 && ow > 16) { + // winograd conv impl + impl_ = new lite::arm::math::WinogradConv; + VLOG(3) << "invoking winograd conv"; + } else { + // direct conv impl + impl_ = new lite::arm::math::DirectConv; + VLOG(3) << "invoking direct conv"; + } + } else if (param.groups == 1 && kw == 3 && stride == 2 && kps_equal && + no_dilation) { + // direct conv impl + impl_ = new lite::arm::math::DirectConv; + VLOG(3) << "invoking direct conv"; + } else { + impl_ = new lite::arm::math::GemmLikeConv; + VLOG(3) << "invoking gemm like conv"; + } + CHECK(this->impl_->create(param, &ctx)); +} + +void ConvCompute::Run() { + auto& param = this->Param(); + CHECK(impl_); + impl_->run(param); + // if (this->act_ != nullptr) { + // this->act_->run(outputs, outputs, param.activation_param); + // } +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, + paddle::lite::kernels::arm::ConvCompute, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); + +REGISTER_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, + paddle::lite::kernels::arm::ConvCompute, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/paddle/fluid/lite/kernels/arm/conv_compute.h b/paddle/fluid/lite/kernels/arm/conv_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..21fabf8c3e8f7983a891265135c39b96aaf42e8d --- /dev/null +++ 
b/paddle/fluid/lite/kernels/arm/conv_compute.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/fluid/lite/arm/math/funcs.h"
+#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/operators/conv_op.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+
+class ConvCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ConvParam;
+
+  void PrepareForRun() override;
+
+  void Run() override;
+
+  ~ConvCompute() {
+    if (impl_ != nullptr) {
+      delete impl_;
+    }
+  }
+
+ private:
+  lite::arm::math::ImplBase<TARGET(kARM), PRECISION(kFloat), param_t>* impl_{
+      nullptr};
+};
+
+} // namespace arm
+} // namespace kernels
+} // namespace lite
+} // namespace paddle
diff --git a/paddle/fluid/lite/kernels/arm/conv_compute_test.cc b/paddle/fluid/lite/kernels/arm/conv_compute_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e4d80265d7728fa0eeea97fd070a982a8888ec7e
--- /dev/null
+++ b/paddle/fluid/lite/kernels/arm/conv_compute_test.cc
@@ -0,0 +1,248 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
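+
+// Sweeps conv configurations (shape, bias, relu, depthwise, dilation,
+// stride, padding, kernel size) and checks ConvCompute against the naive
+// conv_compute_ref implementation defined below.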
+ +#include "paddle/fluid/lite/kernels/arm/conv_compute.h" +#include +#include +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +template +void conv_compute_ref(const operators::ConvParam& param) { + auto input = param.x; + auto filter = param.filter; + auto output = param.output; + DDim input_dims = param.x->dims(); + DDim filter_dims = param.filter->dims(); + DDim output_dims = param.output->dims(); + std::vector paddings = param.paddings; + std::vector strides = param.strides; + std::vector dilations = param.dilations; + int groups = param.groups; + + auto input_data = param.x->data(); + auto output_data = param.output->mutable_data(); + auto filter_data = param.filter->mutable_data(); + const float* bias_data = nullptr; + if (param.bias != nullptr) { + bias_data = param.bias->mutable_data(); + } + bool flag_bias = bias_data != nullptr; + bool flag_relu = false; // TODO(hong19860320) param.relu + + int num = input_dims[0]; + int chout = output_dims[1]; + int hout = output_dims[2]; + int wout = output_dims[3]; + + int chin = input_dims[1]; + int hin = input_dims[2]; + int win = input_dims[3]; + int out_c_group = chout / groups; + int in_c_group = chin / groups; + + int stride_h = strides[0]; + int stride_w = strides[1]; + int dilation_h = dilations[0]; + int dilation_w = dilations[1]; + int padding_h = paddings[0]; + int padding_w = paddings[1]; + int kernel_h = filter_dims[2]; + int kernel_w = filter_dims[3]; + + for (int n = 0; n < num; ++n) { + for (int g = 0; g < groups; ++g) { + for (int oc = 0; oc < out_c_group; ++oc) { + for (int oh = 0; oh < hout; ++oh) { + for (int ow = 0; ow < wout; ++ow) { + int out_idx = n * groups * out_c_group * hout * wout + + g * out_c_group * hout * wout + oc * hout * wout + + oh * wout + ow; + output_data[out_idx] = + flag_bias ? static_cast(bias_data[g * out_c_group + oc]) + : 0.f; + for (int ic = 0; ic < in_c_group; ++ic) { + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int iw = ow * stride_w - padding_w + kw * (dilation_w); + int ih = oh * stride_h - padding_h + kh * (dilation_h); + if (iw < 0 || iw >= win) continue; + if (ih < 0 || ih >= hin) continue; + + int iidx = n * chin * hin * win + g * in_c_group * hin * win + + ic * hin * win + ih * win + iw; + int widx = + g * out_c_group * in_c_group * kernel_h * kernel_w + + oc * in_c_group * kernel_h * kernel_w + + ic * kernel_h * kernel_w + kh * kernel_w + kw; + + output_data[out_idx] += + (dtype)input_data[iidx] * (dtype)filter_data[widx]; + } + } + } + if (flag_relu) { + output_data[out_idx] = + output_data[out_idx] > 0.f ? 
output_data[out_idx] : 0.f; + } + } + } + } + } + } +} + +TEST(conv_arm, retrive_op) { + auto conv = KernelRegistry::Global().Create( + "conv2d"); + ASSERT_FALSE(conv.empty()); + ASSERT_TRUE(conv.front()); +} + +TEST(conv_arm, init) { + ConvCompute conv; + ASSERT_EQ(conv.precision(), PRECISION(kFloat)); + ASSERT_EQ(conv.target(), TARGET(kARM)); +} + +TEST(conv_arm, compute) { + DeviceInfo::Init(); +#if 1 + for (auto n : {2}) { + for (auto ic : {6}) { + for (auto oc : {6}) { + for (auto ih : {9}) { + for (auto iw : {9}) { + for (auto flag_bias : {false, true}) { + for (auto flag_relu : {false, true}) { + for (auto depthwise : {false, true}) { + for (auto dilation : {1}) { + for (auto stride : {1, 2}) { + for (auto padding : {0, 1, 2}) { + for (auto ks : {1, 3, 5}) { +#else + for (auto n : {1, 2}) { + for (auto ic : {6, 32 /*, 128*/}) { + for (auto oc : {6, 32 /*, 128*/}) { + for (auto ih : {9, 18 /*, 56 , 112, 224, 512*/}) { + for (auto iw : {9, 18 /*, 56, 112, 224, 512*/}) { + for (auto flag_bias : {false, true}) { + for (auto flag_relu : {false, true}) { + for (auto depthwise : {false, true}) { + for (auto dilation : {1, 2}) { + for (auto stride : {1, 2}) { + for (auto padding : {0, 1, 2}) { + for (auto ks : {1, 3, 5}) { +#endif + int group = 1; + if (depthwise) { // depthwise convolution ? + group = oc = ic; + } + // get input, filter and output shape + std::vector input_shape = {n, ic, ih, iw}; + std::vector filter_shape = {oc, ic / group, + ks, ks}; + const int dks = dilation * (ks - 1) + 1; + int oh = (ih + 2 * padding - dks) / stride + 1; + int ow = (iw + 2 * padding - dks) / stride + 1; + std::vector output_shape({n, oc, oh, ow}); + // resize input, filter and output + Tensor input; + Tensor filter; + Tensor bias; + Tensor output; + Tensor output_ref; + input.Resize(input_shape); + filter.Resize(filter_shape); + output.Resize(output_shape); + output_ref.Resize(output_shape); + VLOG(3) << "input: " << input.dims(); + VLOG(3) << "filter: " << filter.dims() + << " padding:" << padding + << " stride:" << stride + << " dilation:" << dilation; + VLOG(3) << "output: " << output.dims(); + auto* input_data = input.mutable_data(); + auto* filter_data = filter.mutable_data(); + auto* output_data = output.mutable_data(); + for (int i = 0; i < input.dims().production(); i++) { + input_data[i] = static_cast(i % 128); + } + for (int i = 0; i < filter.dims().production(); i++) { + filter_data[i] = + i * 0.001f / + static_cast(filter.dims().production()); + } + // prepare kernel params and run + ConvCompute conv; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + conv.SetContext(std::move(ctx)); + operators::ConvParam param; + param.x = &input; + param.filter = &filter; + param.output = &output; + param.bias = nullptr; + if (flag_bias) { + bias.Resize({oc}); + auto* bias_data = bias.mutable_data(); + for (int i = 0; i < bias.dims().production(); i++) { + bias_data[i] = static_cast(i); + } + param.bias = &bias; + } + // TODO(hong19860320) param.relu = flag_relu; + param.paddings = std::vector({padding, padding}); + param.strides = std::vector({stride, stride}); + param.dilations = + std::vector({dilation, dilation}); + param.groups = group; + conv.SetParam(param); + conv.Launch(); + // invoking ref implementation and compare results + param.output = &output_ref; + conv_compute_ref(param); + auto* output_ref_data = + output_ref.mutable_data(); + for (int i = 0; i < output.dims().production(); i++) { + EXPECT_NEAR(output_data[i], output_ref_data[i], + 1e-3); + } + } + } + } + } + } + } + } + } + 
} + } + } + } +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/arm/fc_compute.cc b/paddle/fluid/lite/kernels/arm/fc_compute.cc index b26551e0533a5ae68c930cc1b9512ba0ca13253a..efd98008e7324eb1f884d1b1cad20b3ed1b0419e 100644 --- a/paddle/fluid/lite/kernels/arm/fc_compute.cc +++ b/paddle/fluid/lite/kernels/arm/fc_compute.cc @@ -22,6 +22,10 @@ namespace lite { namespace kernels { namespace arm { +void FcCompute::PrepareForRun() { + // TODO(TJ): transpose weight +} + void FcCompute::Run() { auto& param = this->Param(); auto x_dims = param.input->dims(); @@ -48,22 +52,16 @@ void FcCompute::Run() { &ctx); lite::arm::math::sgemm_prepack(packed_in, w_data, b_data, o_data, x_h, n, x_w, false, false, false, &ctx); - if (param.bias) { CHECK_EQ(param.bias->numel(), n); lite::arm::math::fill_bias_fc(o_data, b_data, x_h, n); } } else { - // use sgemmv - // sgemv((const float*)weights, (const float*)din, (float*)dout, - // false, n, x_w, _param->_flag_bias, (float*)bias, false); + lite::arm::math::sgemv(w_data, i_data, o_data, false, n, x_w, + b_data != nullptr, b_data, false); } } -TargetType FcCompute::target() const { return TARGET(kARM); } - -PrecisionType FcCompute::precision() const { return PRECISION(kFloat); } - } // namespace arm } // namespace kernels } // namespace lite diff --git a/paddle/fluid/lite/kernels/arm/fc_compute.h b/paddle/fluid/lite/kernels/arm/fc_compute.h index 414517843354f638ed37f54ef596dc6db53193ce..459d23194d8c50f593ebc92da2d5342fb449d110 100644 --- a/paddle/fluid/lite/kernels/arm/fc_compute.h +++ b/paddle/fluid/lite/kernels/arm/fc_compute.h @@ -25,10 +25,9 @@ class FcCompute : public KernelLite { public: using param_t = operators::FcParam; - void Run() override; + void PrepareForRun() override; - TargetType target() const override; - PrecisionType precision() const override; + void Run() override; virtual ~FcCompute() = default; }; diff --git a/paddle/fluid/lite/kernels/arm/mul_compute.cc b/paddle/fluid/lite/kernels/arm/mul_compute.cc index ff12b236031896cfd8503903327ab1141b5171ae..269e4842252c2a88f33c8faf6666d139e36e49f3 100644 --- a/paddle/fluid/lite/kernels/arm/mul_compute.cc +++ b/paddle/fluid/lite/kernels/arm/mul_compute.cc @@ -12,57 +12,57 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include -#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/kernels/arm/mul_compute.h" +#include "paddle/fluid/lite/arm/math/funcs.h" #include "paddle/fluid/lite/core/op_registry.h" -#include "paddle/fluid/lite/core/types.h" +#include "paddle/fluid/lite/core/type_system.h" namespace paddle { namespace lite { namespace kernels { namespace arm { -template -void mul_compute_eigen(const T* x, int x_h, int x_w, const T* y, int y_h, - int y_w, T* out) { - using matrix_t = - Eigen::Matrix; +void MulCompute::PrepareForRun() { + // TODO(TJ): transpose x or y if necessary +} - Eigen::Map X(x, x_h, x_w); - Eigen::Map Y(y, y_h, y_w); - Eigen::Map Out(out, x_h, y_w); +void MulCompute::Run() { + auto& param = Param(); - Out = X * Y; -} + const auto* x_data = param.x->data(); + const auto* y_data = param.y->data(); + auto* o_data = param.output->mutable_data(); -class MulCompute : public KernelLite { - public: - using param_t = operators::MulParam; + int m = static_cast( + param.x->dims().Slice(0, param.x_num_col_dims).production()); + int x_w = + static_cast(param.x->dims() + .Slice(param.x_num_col_dims, param.x->dims().size()) + .production()); + int y_h = static_cast( + param.y->dims().Slice(0, param.y_num_col_dims).production()); + int n = + static_cast(param.y->dims() + .Slice(param.y_num_col_dims, param.y->dims().size()) + .production()); - void Run() override { - auto& param = Param(); - core::dim2 x_shape( - {static_cast( - param.x->dims().Slice(0, param.x_num_col_dims).production()), - static_cast( - param.x->dims() - .Slice(param.x_num_col_dims, param.x->dims().size()) - .production())}); - core::dim2 y_shape( - {static_cast( - param.y->dims().Slice(0, param.y_num_col_dims).production()), - static_cast( - param.y->dims() - .Slice(param.y_num_col_dims, param.y->dims().size()) - .production())}); + CHECK_EQ(x_w, y_h) << "x_w must be equal with y_h"; + auto k = x_w; + if (n == 1) { + lite::arm::math::sgemv(x_data, y_data, o_data, false, m, k, false, nullptr, + false); - mul_compute_eigen(param.x->data(), x_shape.x, x_shape.y, // - param.y->data(), y_shape.x, y_shape.y, // - param.output->mutable_data()); - } + } else { + constexpr bool is_tranposed_y = false; + auto& ctx = this->ctx_->template As(); - virtual ~MulCompute() = default; -}; + float* packed_x = static_cast(ctx.workspace_data()) + + ctx.l2_cache_size() / sizeof(float); + lite::arm::math::prepackA(packed_x, x_data, k, 0, m, 0, k, false, &ctx); + lite::arm::math::sgemm_prepack(packed_x, y_data, nullptr, o_data, m, n, k, + false, false, is_tranposed_y, &ctx); + } +} } // namespace arm } // namespace kernels diff --git a/paddle/fluid/lite/kernels/arm/mul_compute.h b/paddle/fluid/lite/kernels/arm/mul_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..c18995e5a5c3cceb749465382b284c0a52c188a4 --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/mul_compute.h @@ -0,0 +1,39 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +class MulCompute : public KernelLite { + public: + using param_t = operators::MulParam; + + void PrepareForRun() override; + + void Run() override; + + virtual ~MulCompute() = default; +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/arm/mul_compute_test.cc b/paddle/fluid/lite/kernels/arm/mul_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..5e3d17ec93ae9d73028343b3d4dd1e77a0fe86f0 --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/mul_compute_test.cc @@ -0,0 +1,152 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/kernels/arm/mul_compute.h" +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/lite/arm/math/funcs.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +template +void FillData(T* a, const int n, const T lower = static_cast(-2.f), + const T upper = static_cast(2.f)) { + static unsigned int seed = 100; + std::mt19937 rng(seed++); + std::uniform_real_distribution uniform_dist(0, 1); + for (int i = 0; i < n; ++i) { + a[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); + } +} + +TEST(mul_arm, retrive_op) { + auto mul = + KernelRegistry::Global().Create("mul"); + ASSERT_FALSE(mul.empty()); + ASSERT_TRUE(mul.front()); +} + +TEST(mul_arm, init) { + MulCompute mul; + ASSERT_EQ(mul.precision(), PRECISION(kFloat)); + ASSERT_EQ(mul.target(), TARGET(kARM)); +} + +TEST(mul_arm, compare_test) { + using T = float; + + for (int m : {1, 2, 3, 4}) { + for (int n : {1, 2, 3, 4}) { + for (int k : {1, 2, 3, 4}) { + VLOG(3) << "m: " << m << ", n: " << n << ", k: " << k; + lite::Tensor x, y, out, ref; + x.Resize({m, k}); + y.Resize({k, n}); + out.Resize({m, n}); + ref.Resize({m, n}); + + auto* x_data = x.mutable_data(); + auto* y_data = y.mutable_data(); + auto* out_data = out.mutable_data(); + auto* ref_data = ref.mutable_data(); + + FillData(x_data, x.dims().production()); + FillData(y_data, y.dims().production()); + FillData(out_data, out.dims().production(), 0, 0); + FillData(ref_data, ref.dims().production(), 0, 0); + + MulCompute mul; + operators::MulParam param; + + param.x = &x; + param.y = &y; + param.output = &out; + + DeviceInfo::Init(); + std::unique_ptr ctx(new KernelContext); + ctx->As(); + mul.SetParam(param); + mul.SetContext(std::move(ctx)); + mul.PrepareForRun(); + + mul.Run(); + + lite::arm::math::mul_compute_eigen(x_data, m, k, y_data, k, n, + ref_data); + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_data[i], 1e-3); + } + } + } + } +} + +TEST(mul_arm, num_col_dims) 
{ + using T = float; + + lite::Tensor x, y, out, ref; + x.Resize({2, 3, 4}); + y.Resize({3, 4, 5}); + out.Resize({2, 5}); + ref.Resize({2, 5}); + + auto* x_data = x.mutable_data(); + auto* y_data = y.mutable_data(); + auto* out_data = out.mutable_data(); + auto* ref_data = ref.mutable_data(); + + FillData(x_data, x.dims().production()); + FillData(y_data, y.dims().production()); + FillData(out_data, out.dims().production()); + FillData(ref_data, out.dims().production()); + + MulCompute mul; + operators::MulParam param; + + param.x = &x; + param.y = &y; + param.output = &out; + param.x_num_col_dims = 1; + param.y_num_col_dims = 2; + + DeviceInfo::Init(); + std::unique_ptr ctx(new KernelContext); + ctx->As(); + mul.SetParam(param); + mul.SetContext(std::move(ctx)); + mul.PrepareForRun(); + + mul.Run(); + + lite::arm::math::mul_compute_eigen(x_data, 2, 12, y_data, 12, 5, ref_data); + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_data[i], 1e-3); + } +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/arm/pool_compute.cc b/paddle/fluid/lite/kernels/arm/pool_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..6a7716fae6bfc3aa52dad7c8b8192191e986b6f3 --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/pool_compute.cc @@ -0,0 +1,170 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
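+
+// PoolCompute dispatches to the specialized global, 2x2s2 and 3x3 pooling
+// implementations when the parameters match, otherwise falls back to
+// pooling_basic.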
+ +#include "paddle/fluid/lite/kernels/arm/pool_compute.h" +#include +#include +#include "paddle/fluid/lite/arm/math/funcs.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void PoolCompute::Run() { + auto& param = Param(); + auto& in_dims = param.x->dims(); + auto& out_dims = param.output->dims(); + + const float* din = param.x->data(); + float* dout = param.output->mutable_data(); + + std::vector& ksize = param.ksize; + std::vector& strides = param.strides; + std::vector& paddings = param.paddings; + + std::string& pooling_type = param.pooling_type; + bool global_pooling = param.global_pooling; + bool exclusive = param.exclusive; + bool adaptive = param.adaptive; + bool ceil_mode = param.ceil_mode; + bool use_quantizer = param.use_quantizer; + std::string& data_format = param.data_format; + + if (param.global_pooling) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(in_dims[i + 2]); + } + } + +#if 0 + for (int i = 0; i < in_dims.size(); ++i) { + LOG(INFO) << "in_dims[" << i << "]:" << in_dims[i]; + } + for (int i = 0; i < out_dims.size(); ++i) { + LOG(INFO) << "out_dims[" << i << "]:" << out_dims[i]; + } + for (int i = 0; i < ksize.size(); ++i) { + LOG(INFO) << "ksize[" << i << "]:" << ksize[i]; + } + for (int i = 0; i < strides.size(); ++i) { + LOG(INFO) << "strides[" << i << "]:" << strides[i]; + } + for (int i = 0; i < paddings.size(); ++i) { + LOG(INFO) << "paddings[" << i << "]:" << paddings[i]; + } + LOG(INFO) << "global_pooling:" << global_pooling; + LOG(INFO) << "exclusive:" << exclusive; + LOG(INFO) << "adaptive:" << adaptive; + LOG(INFO) << "ceil_mode:" << ceil_mode; + LOG(INFO) << "use_quantizer:" << use_quantizer; + LOG(INFO) << "data_format:" << data_format; + LOG(INFO) << "din:" << din; + LOG(INFO) << "dout:" << dout; +#endif + + // global + if (global_pooling == true) { + lite::arm::math::pooling_global( + din, dout, out_dims[0], out_dims[1], out_dims[2], out_dims[3], + in_dims[1], in_dims[2], in_dims[3], ksize, strides, paddings, + global_pooling, exclusive, adaptive, ceil_mode, use_quantizer, + pooling_type); + } else if (ksize[0] == 2 && ksize[0] == ksize[1] && strides[0] == 2 && + strides[0] == strides[1]) { + if (pooling_type == "max") { + lite::arm::math::pooling2x2s2_max( + din, dout, out_dims[0], out_dims[1], out_dims[2], out_dims[3], + in_dims[1], in_dims[2], in_dims[3], ksize, strides, paddings, + global_pooling, exclusive, adaptive, ceil_mode, use_quantizer, + pooling_type); + } else if (pooling_type == "avg") { + lite::arm::math::pooling2x2s2_ave( + din, dout, out_dims[0], out_dims[1], out_dims[2], out_dims[3], + in_dims[1], in_dims[2], in_dims[3], ksize, strides, paddings, + global_pooling, exclusive, adaptive, ceil_mode, use_quantizer, + pooling_type); + } + } else if (ksize[0] == 3 && ksize[0] == ksize[1] && strides[0] == 1 && + strides[0] == strides[1] && paddings[0] == 1) { + if (pooling_type == "max") { + lite::arm::math::pooling3x3s1p1_max( + din, dout, out_dims[0], out_dims[1], out_dims[2], out_dims[3], + in_dims[1], in_dims[2], in_dims[3], ksize, strides, paddings, + global_pooling, exclusive, adaptive, ceil_mode, use_quantizer, + pooling_type); + } else if (pooling_type == "avg") { + lite::arm::math::pooling3x3s1p1_ave( + din, dout, out_dims[0], out_dims[1], out_dims[2], out_dims[3], + in_dims[1], in_dims[2], in_dims[3], ksize, strides, paddings, + global_pooling, exclusive, 
adaptive, ceil_mode, use_quantizer, + pooling_type); + } + } else if (ksize[0] == 3 && ksize[0] == ksize[1] && strides[0] == 2 && + strides[0] == strides[1] && paddings[0] == 0) { + if (pooling_type == "max") { + lite::arm::math::pooling3x3s2p0_max( + din, dout, out_dims[0], out_dims[1], out_dims[2], out_dims[3], + in_dims[1], in_dims[2], in_dims[3], ksize, strides, paddings, + global_pooling, exclusive, adaptive, ceil_mode, use_quantizer, + pooling_type); + } else if (pooling_type == "avg") { + lite::arm::math::pooling3x3s2p0_ave( + din, dout, out_dims[0], out_dims[1], out_dims[2], out_dims[3], + in_dims[1], in_dims[2], in_dims[3], ksize, strides, paddings, + global_pooling, exclusive, adaptive, ceil_mode, use_quantizer, + pooling_type); + } + } else if (ksize[0] == 3 && ksize[0] == ksize[1] && strides[0] == 2 && + strides[0] == strides[1] && paddings[0] == 1) { + if (pooling_type == "max") { + lite::arm::math::pooling3x3s2p1_max( + din, dout, out_dims[0], out_dims[1], out_dims[2], out_dims[3], + in_dims[1], in_dims[2], in_dims[3], ksize, strides, paddings, + global_pooling, exclusive, adaptive, ceil_mode, use_quantizer, + pooling_type); + } else if (pooling_type == "avg") { + lite::arm::math::pooling3x3s2p1_ave( + din, dout, out_dims[0], out_dims[1], out_dims[2], out_dims[3], + in_dims[1], in_dims[2], in_dims[3], ksize, strides, paddings, + global_pooling, exclusive, adaptive, ceil_mode, use_quantizer, + pooling_type); + } + } else { + lite::arm::math::pooling_basic( + din, dout, out_dims[0], out_dims[1], out_dims[2], out_dims[3], + in_dims[1], in_dims[2], in_dims[3], ksize, strides, paddings, + global_pooling, exclusive, adaptive, ceil_mode, use_quantizer, + pooling_type); + } + return; +} + +TargetType PoolCompute::target() const { return TARGET(kARM); } + +PrecisionType PoolCompute::precision() const { return PRECISION(kFloat); } + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(pool, kARM, kFloat, kNCHW, + paddle::lite::kernels::arm::PoolCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/paddle/fluid/lite/kernels/arm/pool_compute.h b/paddle/fluid/lite/kernels/arm/pool_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..76dedbc3132405cd70d74e233619572f97dc07e0 --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/pool_compute.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/operators/pool_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +class PoolCompute : public KernelLite { + public: + using param_t = operators::PoolParam; + + void Run() override; + + TargetType target() const override; + PrecisionType precision() const override; + + virtual ~PoolCompute() = default; +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/arm/pool_compute_test.cc b/paddle/fluid/lite/kernels/arm/pool_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..b024ccef9d526d56bcf52c1600940ff0804eaf1f --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/pool_compute_test.cc @@ -0,0 +1,275 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/kernels/arm/pool_compute.h" +#include +#include +#include +#include +#include "paddle/fluid/lite/arm/math/funcs.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void pool_compute_ref(const operators::PoolParam& param) { + auto& in_dims = param.x->dims(); + auto& out_dims = param.output->dims(); + + const float* src_ptr = param.x->data(); + float* dst_ptr = param.output->mutable_data(); + + std::vector ksize = param.ksize; + std::vector strides = param.strides; + std::vector paddings = param.paddings; + + std::string pooling_type = param.pooling_type; + bool global_pooling = param.global_pooling; + bool exclusive = param.exclusive; + bool adaptive = param.adaptive; + bool ceil_mode = param.ceil_mode; + bool use_quantizer = param.use_quantizer; + std::string data_format = param.data_format; + + int in_n = in_dims[0]; + int in_c = in_dims[1]; + int in_h = in_dims[2]; + int in_w = in_dims[3]; + int size_in_n = in_c * in_h * in_w; + int size_in_c = in_h * in_w; + + int out_h = out_dims[2]; + int out_w = out_dims[3]; + int size_out_n = in_c * out_h * out_w; + int size_out_c = out_h * out_w; + + int window_h = ksize[0]; + int window_w = ksize[1]; + int stride_h = strides[0]; + int stride_w = strides[1]; + int pad_h = paddings[0]; + int pad_w = paddings[1]; + + if (global_pooling == true) { + ksize[0] = in_h; + ksize[1] = in_w; + } + +#if 0 + for (int i = 0; i < ksize.size(); ++i) { + LOG(INFO) << "ksize[" << i << "]:" << ksize[i]; + } + for (int i = 0; i < strides.size(); ++i) { + LOG(INFO) << "strides[" << i << "]:" << strides[i]; + } + for (int i = 0; i < paddings.size(); ++i) { + LOG(INFO) << "paddings[" << i << "]:" << paddings[i]; + } + LOG(INFO) << "in nchw:" << in_n << ", " << in_c << ", " << in_h << ", " + << in_w; + LOG(INFO) << "size_in_n:" << size_in_n; + LOG(INFO) << "size_out_c:" << size_out_c; + LOG(INFO) << "out_h:" << out_h; + LOG(INFO) << "out_w:" << out_w; + LOG(INFO) << "size_out_n:" << size_out_n; + LOG(INFO) << 
"size_out_c:" << size_out_c; + LOG(INFO) << "window_h:" << window_h; + LOG(INFO) << "window_w:" << window_w; + LOG(INFO) << "stride_h:" << stride_h; + LOG(INFO) << "stride_w:" << stride_w; + LOG(INFO) << "pad_h:" << pad_h; + LOG(INFO) << "pad_w:" << pad_w; +#endif + + for (int ind_n = 0; ind_n < in_n; ++ind_n) { + for (int ind_c = 0; ind_c < in_c; ++ind_c) { + for (int ind_h = 0; ind_h < out_h; ++ind_h) { + int sh = ind_h * stride_h; + int eh = sh + window_h; + sh = (sh - pad_h) < 0 ? 0 : sh - pad_h; + eh = (eh - pad_h) > in_h ? in_h : eh - pad_h; + + for (int ind_w = 0; ind_w < out_w; ++ind_w) { + int sw = ind_w * stride_w; + int ew = sw + window_w; + sw = (sw - pad_w) < 0 ? 0 : sw - pad_w; + ew = (ew - pad_w) > in_w ? in_w : ew - pad_w; + + float result = static_cast(0); + + int dst_ind = + ind_n * size_out_n + ind_c * size_out_c + ind_h * out_w + ind_w; + + for (int kh = sh; kh < eh; ++kh) { + for (int kw = sw; kw < ew; ++kw) { + int src_ind = + ind_n * size_in_n + ind_c * size_in_c + kh * in_w + kw; + + if (kh == sh && kw == sw) { + result = src_ptr[src_ind]; + } else { + if (pooling_type == "max") { + result = + result >= src_ptr[src_ind] ? result : src_ptr[src_ind]; + } + if (pooling_type == "avg" && exclusive == false) { + // Pooling_average_include_padding + result += src_ptr[src_ind]; + } + if (pooling_type == "avg" && exclusive == true) { + // Pooling_average_include_padding + result += src_ptr[src_ind]; + } + } + } + } + if (pooling_type == "avg" && exclusive == false) { + // Pooling_average_include_padding + // result /= param.window_h * param.window_w; + // LOG(ERROR)<<"cpu"<= in_w + pad_w ? in_w + pad_w : sw + window_w; + bw -= sw; + } + if (eh == in_h) { + bh = sh + window_h >= in_h + pad_h ? in_h + pad_h : sh + window_h; + bh -= sh; + } + result /= bh * bw; + } + if (pooling_type == "avg" && exclusive == true) { + // Pooling_average_exclude_padding + result /= (ew - sw) * (eh - sh); + } + dst_ptr[dst_ind] = result; + } + } + } + } +} + +TEST(pool_arm, init) { + PoolCompute pool; + ASSERT_EQ(pool.precision(), PRECISION(kFloat)); + ASSERT_EQ(pool.target(), TARGET(kARM)); +} + +TEST(pool_arm, compute) { + PoolCompute pool; + operators::PoolParam param; + + lite::Tensor x; + lite::Tensor output; + lite::Tensor output_ref; + + for (auto pooling_type : {"avg", "max"}) { + for (auto global_pooling : {true}) { + for (auto stride : {2}) { + for (auto pad : {0}) { + for (auto n : {1, 3, 4, 11}) { + for (auto c : {1, 3, 11 /* ,1024 */}) { // speedup for ci + for (auto h : {3, 1, 11, 4, 1}) { + for (auto w : {1, 3, 4, 12, 1}) { + VLOG(3) << "n:" << n << " c:" << c << " h:" << h << " w:" << w + << " stride:" << stride << " pad:" << pad + << " pooling_type:" << pooling_type + << " global_pooling:" << global_pooling; + + // init x, output + x.Resize(DDim(std::vector({n, c, h, w}))); + output.Resize(DDim(std::vector({n, c, 1, 1}))); + output_ref.Resize(DDim(std::vector({n, c, 1, 1}))); + auto* x_data = x.mutable_data(); + for (int i = 0; i < x.dims().production(); ++i) { + x_data[i] = i; + } + + // fill param + param.x = &x; + param.output = &output; + param.pooling_type = pooling_type; + param.ksize = {h, w}; + param.global_pooling = global_pooling; + param.strides = {stride, stride}; + param.paddings = {pad, pad}; + param.exclusive = true; + param.adaptive = false; + param.ceil_mode = false; + param.use_quantizer = false; + + // compute + pool.SetParam(param); + pool.Run(); + +#if 0 + LOG(INFO) << "n:" << n << " c:" << c << " h:" << h << " w:" << w + << " end"; + std::cout << "n:" << n 
<< " c:" << c << " h:" << h << " w:" << w + << " end" << std::endl; + for (int i = 0; i < param.ksize.size(); ++i) { + std::cout << " ksize[" << i << "]:" << param.ksize[i]; + } + std::cout << "\n"; + for (int i = 0; i < param.strides.size(); ++i) { + std::cout << " strides[" << i << "]:" << param.strides[i]; + } + std::cout << "\n"; + for (int i = 0; i < param.paddings.size(); ++i) { + std::cout << " paddings[" << i << "]:" << param.paddings[i]; + } + std::cout << "\n"; +#endif + + // compute ref + // output_ref.Resize(output.dims()); + param.output = &output_ref; + pool_compute_ref(param); + VLOG(3) << "pool_compute_ref(param) end"; + + // compare + auto* output_data = output.mutable_data(); + auto* output_ref_data = output_ref.mutable_data(); + for (int i = 0; i < output.dims().production(); i++) { + EXPECT_NEAR(output_data[i], output_ref_data[i], + 1); // 1e-5); + } + + VLOG(3) << "compare pass"; + } + } + } + } + } // pad + } // stride + } // global_pooling + } // pooling_type +} + +TEST(pool, retrive_op) { + auto pool = + KernelRegistry::Global().Create("pool"); + ASSERT_FALSE(pool.empty()); + ASSERT_TRUE(pool.front()); +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(pool, kARM, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/arm/scale_compute_test.cc b/paddle/fluid/lite/kernels/arm/scale_compute_test.cc index fee47d7eb7a6c093524bb0af617c60d069add01a..b1277792286429b666b3479c0655bb211a69db30 100644 --- a/paddle/fluid/lite/kernels/arm/scale_compute_test.cc +++ b/paddle/fluid/lite/kernels/arm/scale_compute_test.cc @@ -54,6 +54,15 @@ TEST(scale_arm, compute) { lite::Tensor output; lite::Tensor output_ref; +#if 1 // for ci speedup + for (auto n : {1, 3}) { + for (auto c : {1, 3}) { + for (auto h : {3, 4}) { + for (auto w : {4, 3}) { + for (auto bias_after_scale : {true, false}) { + for (auto s : {-1.0f, 0.13f}) { + for (auto b : {-15.f, 0.11234f}) { +#else for (auto n : {1, 3, 4, 11}) { for (auto c : {1, 3, 11, 4}) { for (auto h : {3, 1, 11, 4}) { @@ -61,6 +70,8 @@ TEST(scale_arm, compute) { for (auto bias_after_scale : {true, false}) { for (auto s : {-100.25f, -1.0f, 0.13f, 3840.975f}) { for (auto b : {-3075.495f, -15.f, 0.11234f, 128.15f}) { +#endif + x.Resize(DDim(std::vector({n, c, h, w}))); output.Resize(DDim(std::vector({n, c, h, w}))); output_ref.Resize(DDim(std::vector({n, c, h, w}))); diff --git a/paddle/fluid/lite/kernels/arm/split_compute.cc b/paddle/fluid/lite/kernels/arm/split_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..3c2416bd6907199e6e83baf65c428b675462f271 --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/split_compute.cc @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/lite/kernels/arm/split_compute.h" +#include +#include "paddle/fluid/lite/arm/math/funcs.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void SplitCompute::Run() { + auto& param = Param(); + const float* din = param.x->data(); + auto& dout = param.output; + auto in_dim = param.x->dims(); + std::vector in_strides(in_dim.size()); + in_strides[in_dim.size() - 1] = in_dim[in_dim.size() - 1]; + for (int i = in_dim.size() - 2; i >= 0; --i) { + in_strides[i] = in_strides[i + 1] * in_dim[i]; + } + lite::arm::math::split(din, dout, param.axis, in_strides); +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(split, kARM, kFloat, kNCHW, + paddle::lite::kernels::arm::SplitCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/paddle/fluid/lite/kernels/arm/split_compute.h b/paddle/fluid/lite/kernels/arm/split_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..22701ba0fd9a77149939933c2e9fcc0c9295e3a1 --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/split_compute.h @@ -0,0 +1,35 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +class SplitCompute : public KernelLite { + public: + void Run() override; + + virtual ~SplitCompute() = default; +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/arm/split_compute_test.cc b/paddle/fluid/lite/kernels/arm/split_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..39632bee8decfe875f0adb3c2717d58e593c400b --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/split_compute_test.cc @@ -0,0 +1,175 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/lite/kernels/arm/split_compute.h" +#include +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void splite_resize_out(const lite::Tensor* din, + const std::vector& dout, int axis, + int num, const std::vector& sections) { + auto in_dims = din->dims(); + int outs_number = dout.size(); + + std::vector outs_dims; + outs_dims.reserve(outs_number); + + if (num > 0) { + int out_axis_dim = in_dims[axis] / num; + for (int i = 0; i < outs_number; ++i) { + auto dim = in_dims; + dim[axis] = out_axis_dim; + outs_dims.push_back(dim); + } + } else if (sections.size() > 0) { + for (size_t i = 0; i < outs_number; ++i) { + auto dim = in_dims; + dim[axis] = sections[i]; + outs_dims.push_back(dim); + } + } + + for (int j = 0; j < outs_dims.size(); ++j) { + dout[j]->Resize(outs_dims[j]); + } +} + +template +void split_compute_ref(const operators::SplitParam& param) { + const dtype* din = param.x->mutable_data(); + auto& dout = param.output; + auto in_dim = param.x->dims(); + int axis = param.axis; + std::vector in_strides(in_dim.size()); + in_strides[in_dim.size() - 1] = in_dim[in_dim.size() - 1]; + for (int i = in_dim.size() - 2; i >= 0; --i) { + in_strides[i] = in_strides[i + 1] * in_dim[i]; + } + + int input_offset = 0; + for (auto out : dout) { + auto out_dim = out->dims(); + std::vector out_strides(out_dim.size()); + out_strides[out_dim.size() - 1] = out_dim[out_dim.size() - 1]; + for (int i = out_dim.size() - 2; i >= 0; --i) { + out_strides[i] = out_strides[i + 1] * out_dim[i]; + } + + dtype* out_data = out->mutable_data(); + int before = out_strides[0] / out_strides[axis]; + int in_after = in_strides[axis]; + int out_after = out_strides[axis]; + + for (int i = 0; i < before; ++i) { + std::memcpy(out_data + i * out_after, din + input_offset + i * in_after, + sizeof(dtype) * out_after); + } + input_offset += out_strides[axis]; + } +} + +TEST(split_arm, init) { + SplitCompute split; + ASSERT_EQ(split.precision(), PRECISION(kFloat)); + ASSERT_EQ(split.target(), TARGET(kARM)); +} + +TEST(split_arm, compute) { + SplitCompute split; + operators::SplitParam param; + + lite::Tensor x; + std::vector output; + std::vector output_ref; + + for (auto n : {1, 3, 4}) { + for (auto c : {1, 3, 4}) { + for (auto h : {1, 3, 4}) { + for (auto w : {1, 3, 4}) { + for (auto axis : {0, 1, 2, 3}) { + for (auto num : {0, 1, 2, 3}) { + for (auto sections : + {std::vector{1, 1, 1}, std::vector{2, 2}, + std::vector{1, 2}}) { + auto x_dim = DDim(std::vector({n, c, h, w})); + x.Resize(x_dim); + if ((num != 0 && x_dim[axis] % num != 0) || + (num == 0 && x_dim[axis] % sections.size() != 0)) + continue; + auto* x_data = x.mutable_data(); + for (int i = 0; i < x.dims().production(); i++) { + x_data[i] = i; + } + for (auto out : output) delete out; + for (auto out : output_ref) delete out; + output.clear(); + output_ref.clear(); + + int outs_number; + if (num > 0) { + outs_number = num; + } else { + outs_number = sections.size(); + } + for (int i = 0; i < outs_number; i++) { + output.push_back(new lite::Tensor); + output_ref.push_back(new lite::Tensor); + } + splite_resize_out(&x, output, axis, num, sections); + splite_resize_out(&x, output_ref, axis, num, sections); + param.x = &x; + param.axis = axis; + param.num = num; + param.sections = sections; + param.output = output; + split.SetParam(param); + split.Run(); + param.output = output_ref; + split_compute_ref(param); + for (int i = 0; i < output.size(); i++) { + 
float* output_data = output[i]->mutable_data(); + float* output_ref_data = output_ref[i]->mutable_data(); + for (int j = 0; j < output[i]->dims().production(); j++) { + EXPECT_NEAR(output_data[j], output_ref_data[j], 1e-5); + } + } + } + } + } + } + } + } + } +} + +TEST(split, retrive_op) { + auto split = + KernelRegistry::Global().Create("split"); + ASSERT_FALSE(split.empty()); + ASSERT_TRUE(split.front()); +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(split, kARM, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/arm/use_kernels.h b/paddle/fluid/lite/kernels/arm/use_kernels.h index d856950f3a177d08cdc950c259abf3d1a194ee25..1f93a81aa94f09f8330aa385840adec559d7161d 100644 --- a/paddle/fluid/lite/kernels/arm/use_kernels.h +++ b/paddle/fluid/lite/kernels/arm/use_kernels.h @@ -19,5 +19,6 @@ USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(pool, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(feed, kARM, kAny, kAny, def); USE_LITE_KERNEL(fetch, kARM, kAny, kAny, def); diff --git a/paddle/fluid/lite/kernels/cuda/CMakeLists.txt b/paddle/fluid/lite/kernels/cuda/CMakeLists.txt index f35f634a217fabd539c9b124c44bc6cdeb186dd6..b7a48946257cb03e311949dd0aa51e31ad239eca 100644 --- a/paddle/fluid/lite/kernels/cuda/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/cuda/CMakeLists.txt @@ -9,3 +9,4 @@ cc_library(io_copy_compute_cuda SRCS io_copy_compute.cc DEPS ${tensor_lite}) nv_library(kernels_cuda DEPS mul_compute_cuda io_copy_compute_cuda cuda_blas_lite) + diff --git a/paddle/fluid/lite/kernels/host/CMakeLists.txt b/paddle/fluid/lite/kernels/host/CMakeLists.txt index a71a8e13ab8fe1667dc7d0dc8477d58182d5139f..7e8e6bcb6db82c570885b32aeed8542ed10209a5 100644 --- a/paddle/fluid/lite/kernels/host/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/host/CMakeLists.txt @@ -13,3 +13,4 @@ set(host_kernels ) set(host_kernels "${host_kernels}" CACHE GLOBAL "host kernels") + diff --git a/paddle/fluid/lite/kernels/x86/CMakeLists.txt b/paddle/fluid/lite/kernels/x86/CMakeLists.txt index 3747351d5626b9cb5e0e5afda6b01e6d7a464ad5..c2845fb9b21b2e4d0bb7ff378676d4531212db52 100644 --- a/paddle/fluid/lite/kernels/x86/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/x86/CMakeLists.txt @@ -35,3 +35,4 @@ set(x86_kernels ) set(x86_kernels "${x86_kernels}" CACHE INTERNAL "x86 kernels") + diff --git a/paddle/fluid/lite/model_parser/CMakeLists.txt b/paddle/fluid/lite/model_parser/CMakeLists.txt index 63fe21abdafb916be72fddb99023d6ba4b8530c0..d179e0350ac0edd89912377cc668c6b8888c2638 100644 --- a/paddle/fluid/lite/model_parser/CMakeLists.txt +++ b/paddle/fluid/lite/model_parser/CMakeLists.txt @@ -27,3 +27,4 @@ lite_cc_test(test_op_desc_lite SRCS op_desc_test.cc DEPS cpp_op_desc_lite op_des add_subdirectory(pb) add_subdirectory(cpp) + diff --git a/paddle/fluid/lite/model_parser/cpp/CMakeLists.txt b/paddle/fluid/lite/model_parser/cpp/CMakeLists.txt index 71073179991294aadef40d5df6d23662ec41fcfe..e6e2fc77f00c691176aa5c20c455964bd9bd5e66 100644 --- a/paddle/fluid/lite/model_parser/cpp/CMakeLists.txt +++ b/paddle/fluid/lite/model_parser/cpp/CMakeLists.txt @@ -1 +1,2 @@ cc_library(cpp_op_desc_lite SRCS op_desc.cc DEPS any_lite) + diff --git a/paddle/fluid/lite/model_parser/pb/CMakeLists.txt b/paddle/fluid/lite/model_parser/pb/CMakeLists.txt index 
22d88aeabf479e9c234cfa1e9660a6d2af9439b4..6910542f2a17f1ec5cdbe5f77203197ae3d57b89 100644 --- a/paddle/fluid/lite/model_parser/pb/CMakeLists.txt +++ b/paddle/fluid/lite/model_parser/pb/CMakeLists.txt @@ -1,2 +1,3 @@ cc_library(var_desc_lite SRCS var_desc.cc DEPS framework_proto_lite) cc_library(op_desc_lite SRCS op_desc.cc DEPS framework_proto_lite) + diff --git a/paddle/fluid/lite/operators/CMakeLists.txt b/paddle/fluid/lite/operators/CMakeLists.txt index ed26f5fdb1f8cec9780c686cd2b73a6699170120..ac3dc1285e4ef8f7b6caba63884ec9966957613a 100644 --- a/paddle/fluid/lite/operators/CMakeLists.txt +++ b/paddle/fluid/lite/operators/CMakeLists.txt @@ -1,11 +1,14 @@ set(op_DEPS ${tensor_lite} op_lite op_params_lite) +cc_library(conv_op_lite SRCS conv_op.cc DEPS ${op_DEPS}) +cc_library(pool_op_lite SRCS pool_op.cc DEPS ${op_DEPS}) cc_library(fc_op_lite SRCS fc_op.cc DEPS ${op_DEPS}) cc_library(relu_op_lite SRCS relu_op.cc DEPS ${op_DEPS}) cc_library(mul_op_lite SRCS mul_op.cc DEPS ${op_DEPS}) cc_library(scale_op_lite SRCS scale_op.cc DEPS ${op_DEPS}) cc_library(softmax_op_lite SRCS softmax_op.cc DEPS ${op_DEPS}) cc_library(reshape_op_lite SRCS reshape_op.cc DEPS ${op_DEPS} ) +cc_library(batch_norm_op_lite SRCS batch_norm_op.cc DEPS ${op_DEPS}) cc_library(feed_op_lite SRCS feed_op.cc DEPS ${op_DEPS}) cc_library(fetch_op_lite SRCS fetch_op.cc DEPS ${op_DEPS}) cc_library(io_copy_op_lite SRCS io_copy_op.cc DEPS ${op_DEPS}) @@ -17,16 +20,18 @@ cc_library(fill_constant_op_lite SRCS fill_constant_op.cc DEPS ${op_DEPS}) cc_library(op_params_lite SRCS op_params.cc DEPS ${tensor_lite} any_lite framework_proto_lite) cc_library(dropout_op_lite SRCS dropout_op.cc DEPS ${op_DEPS}) cc_library(concat_op_lite SRCS concat_op.cc DEPS ${op_DEPS}) -cc_library(conv_op_lite SRCS conv_op.cc DEPS ${op_DEPS}) -cc_library(pool_op_lite SRCS pool_op.cc DEPS ${op_DEPS}) +cc_library(split_op_lite SRCS split_op.cc DEPS ${op_DEPS}) set(ops_lite + conv_op_lite + pool_op_lite fc_op_lite relu_op_lite mul_op_lite scale_op_lite softmax_op_lite reshape_op_lite + batch_norm_op_lite feed_op_lite fetch_op_lite io_copy_op_lite @@ -36,15 +41,19 @@ set(ops_lite activation_ops_lite dropout_op_lite concat_op_lite - conv_op_lite - pool_op_lite + split_op_lite PARENT_SCOPE) lite_cc_test(test_fc_op_lite SRCS fc_op_test.cc DEPS fc_op_lite memory_lite X86_DEPS fc_compute_x86 - ARM_DEPS fc_compute_arm) + ARM_DEPS fc_compute_arm) +lite_cc_test(test_pool_op_lite SRCS pool_op_test.cc + DEPS pool_op_lite memory_lite + ARM_DEPS pool_compute_arm) lite_cc_test(test_scale_op_lite SRCS scale_op_test.cc DEPS scale_op_lite memory_lite) lite_cc_test(test_softmax_op_lite SRCS softmax_op_test.cc DEPS softmax_op_lite memory_lite) lite_cc_test(test_reshape_op_lite SRCS reshape_op_test.cc DEPS reshape_op_lite memory_lite) +lite_cc_test(test_batch_norm_op_lite SRCS batch_norm_op_test.cc DEPS batch_norm_op_lite memory_lite) lite_cc_test(test_concat_op_lite SRCS concat_op_test.cc DEPS concat_op_lite memory_lite) + diff --git a/paddle/fluid/lite/operators/batch_norm_op.cc b/paddle/fluid/lite/operators/batch_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e974d0134dad93a2241c265687a190b10d5ff85d --- /dev/null +++ b/paddle/fluid/lite/operators/batch_norm_op.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/operators/batch_norm_op.h" +#include "paddle/fluid/lite/core/op_registry.h" +namespace paddle { +namespace lite { +namespace operators { + +bool BatchNormOp::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.bias); + CHECK_OR_FALSE(param_.scale); + CHECK_OR_FALSE(param_.mean); + CHECK_OR_FALSE(param_.variance); + CHECK_OR_FALSE(param_.y); + if (!param_.is_test) { + CHECK_OR_FALSE(param_.mean_out); + CHECK_OR_FALSE(param_.variance_out); + CHECK_OR_FALSE(param_.saved_mean); + CHECK_OR_FALSE(param_.saved_variance); + } + auto x_dims = param_.x->dims(); + auto scale_dims = param_.scale->dims(); + auto bias_dims = param_.bias->dims(); + auto mean_dims = param_.mean->dims(); + auto variance_dims = param_.variance->dims(); + CHECK(x_dims.size() >= 2 && x_dims.size() <= 5) + << "Input X must have 2 to 5 dimensions."; + CHECK_EQ(scale_dims.size(), 1UL) << "Input Scale must have 1 dimensions."; + CHECK_EQ(bias_dims.size(), 1UL) << "Input Bias must have 1 dimensions."; + CHECK_EQ(mean_dims.size(), 1UL) << "Input Mean must have 1 dimensions."; + CHECK_EQ(variance_dims.size(), 1UL) + << "Input Variance must have 1 dimensions."; + return true; +} + +bool BatchNormOp::InferShape() const { + auto x_dims = param_.x->dims(); + int64_t channel_size = 0; + switch (param_.data_layout) { + case DATALAYOUT(kNCHW): + channel_size = x_dims[1]; + break; + // case DATALAYOUT(kNHWC): + // channel_size = x_dims[x_dims.size() - 1]; + // break; + default: + LOG(FATAL) << "Unknown storage order: " + << DataLayoutToStr(param_.data_layout); + break; + } + if (!param_.is_test) { + param_.mean_out->Resize({channel_size}); + param_.variance_out->Resize({channel_size}); + param_.saved_mean->Resize({channel_size}); + param_.saved_variance->Resize({channel_size}); + } + param_.y->Resize(x_dims); + return true; +} + +bool BatchNormOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { + param_.x = scope->FindVar(op_desc.Input("X").front())->GetMutable(); + param_.bias = + scope->FindVar(op_desc.Input("Bias").front())->GetMutable(); + param_.scale = + scope->FindVar(op_desc.Input("Scale").front())->GetMutable(); + param_.mean = + scope->FindVar(op_desc.Input("Mean").front())->GetMutable(); + param_.variance = + scope->FindVar(op_desc.Input("Variance").front())->GetMutable(); + param_.y = scope->FindVar(op_desc.Output("Y").front())->GetMutable(); + param_.is_test = op_desc.GetAttr("is_test"); + param_.use_global_stats = op_desc.GetAttr("use_global_stats"); + if (!param_.is_test) { + param_.mean_out = + scope->FindVar(op_desc.Output("MeanOut").front())->GetMutable(); + param_.variance_out = scope->FindVar(op_desc.Output("VarianceOut").front()) + ->GetMutable(); + param_.saved_mean = scope->FindVar(op_desc.Output("SavedMean").front()) + ->GetMutable(); + param_.saved_variance = + scope->FindVar(op_desc.Output("SavedVariance").front()) + ->GetMutable(); + } + param_.epsilon = op_desc.GetAttr("epsilon"); + param_.momentum = op_desc.GetAttr("momentum"); + std::string data_layout = op_desc.GetAttr("data_layout"); + CHECK_EQ(data_layout, "NCHW") << 
"TODO(hong19860320): Only support NCHW."; + // param_.data_layout = StringToDataLayout(data_layout); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(batch_norm, paddle::lite::operators::BatchNormOp); diff --git a/paddle/fluid/lite/operators/batch_norm_op.h b/paddle/fluid/lite/operators/batch_norm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..30e8747319b1575b0c63e4b2919ed1363ad10bef --- /dev/null +++ b/paddle/fluid/lite/operators/batch_norm_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/fluid/lite/core/op_lite.h" +#include "paddle/fluid/lite/core/scope.h" +#include "paddle/fluid/lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class BatchNormOp : public OpLite { + public: + BatchNormOp() {} + explicit BatchNormOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "batch_norm"; } + + private: + mutable BatchNormParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/operators/batch_norm_op_test.cc b/paddle/fluid/lite/operators/batch_norm_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..b91c367d92b721c1f96fd5fc92ec0b4f877408e4 --- /dev/null +++ b/paddle/fluid/lite/operators/batch_norm_op_test.cc @@ -0,0 +1,139 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/lite/operators/batch_norm_op.h" +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +TEST(batch_norm_op_lite, test) { + // prepare variables + Scope scope; + auto* x = scope.Var("x")->GetMutable(); + auto* scale = scope.Var("scale")->GetMutable(); + auto* bias = scope.Var("bias")->GetMutable(); + auto* mean = scope.Var("mean")->GetMutable(); + auto* variance = scope.Var("variance")->GetMutable(); + auto* y = scope.Var("y")->GetMutable(); + x->Resize({2, 32, 10, 20}); + auto x_dims = x->dims(); + const int64_t channel_size = x_dims[1]; // NCHW + scale->Resize({channel_size}); + bias->Resize({channel_size}); + mean->Resize({channel_size}); + variance->Resize({channel_size}); + + // prepare op desc + cpp::OpDesc desc; + desc.SetType("batch_norm"); + desc.SetInput("X", {"x"}); + desc.SetInput("Scale", {"scale"}); + desc.SetInput("Bias", {"bias"}); + desc.SetInput("Mean", {"mean"}); + desc.SetInput("Variance", {"variance"}); + desc.SetOutput("Y", {"y"}); + desc.SetAttr("is_test", true); + desc.SetAttr("use_global_stats", false); + desc.SetAttr("epsilon", 1e-5f); + desc.SetAttr("momentum", 0.9f); + desc.SetAttr("data_layout", std::string("NCHW")); + + BatchNormOp batch_norm("batch_norm"); + + batch_norm.SetValidPlaces({Place{TARGET(kHost), PRECISION(kFloat)}}); + batch_norm.Attach(desc, &scope); + batch_norm.CheckShape(); + batch_norm.InferShape(); + + // check output dims + auto y_dims = y->dims(); + CHECK_EQ(y_dims.size(), x_dims.size()); + for (size_t i = 0; i < y_dims.size(); i++) { + CHECK_EQ(y_dims[i], x_dims[i]); + } +} + +TEST(batch_norm_op_lite, test_enable_is_test) { + // prepare variables + Scope scope; + auto* x = scope.Var("x")->GetMutable(); + auto* scale = scope.Var("scale")->GetMutable(); + auto* bias = scope.Var("bias")->GetMutable(); + auto* mean = scope.Var("mean")->GetMutable(); + auto* variance = scope.Var("variance")->GetMutable(); + auto* y = scope.Var("y")->GetMutable(); + auto* mean_out = scope.Var("mean_out")->GetMutable(); + auto* variance_out = scope.Var("variance_out")->GetMutable(); + auto* saved_mean = scope.Var("saved_mean")->GetMutable(); + auto* saved_variance = scope.Var("saved_variance")->GetMutable(); + x->Resize({2, 32, 10, 20}); + auto x_dims = x->dims(); + const int64_t channel_size = x_dims[1]; // NCHW + scale->Resize({channel_size}); + bias->Resize({channel_size}); + mean->Resize({channel_size}); + variance->Resize({channel_size}); + + // prepare op desc + cpp::OpDesc desc; + desc.SetType("batch_norm"); + desc.SetInput("X", {"x"}); + desc.SetInput("Scale", {"scale"}); + desc.SetInput("Bias", {"bias"}); + desc.SetInput("Mean", {"mean"}); + desc.SetInput("Variance", {"variance"}); + desc.SetOutput("Y", {"y"}); + desc.SetOutput("MeanOut", {"mean_out"}); + desc.SetOutput("VarianceOut", {"variance_out"}); + desc.SetOutput("SavedMean", {"saved_mean"}); + desc.SetOutput("SavedVariance", {"saved_variance"}); + desc.SetAttr("is_test", false); + desc.SetAttr("use_global_stats", false); + desc.SetAttr("epsilon", 1e-5f); + desc.SetAttr("momentum", 0.9f); + desc.SetAttr("data_layout", std::string("NCHW")); + + BatchNormOp batch_norm("batch_norm"); + + batch_norm.SetValidPlaces({Place{TARGET(kHost), PRECISION(kFloat)}}); + batch_norm.Attach(desc, &scope); + batch_norm.CheckShape(); + batch_norm.InferShape(); + + // check output dims + auto y_dims = y->dims(); + CHECK_EQ(y_dims.size(), x_dims.size()); + for (size_t i = 0; i < y_dims.size(); i++) { + CHECK_EQ(y_dims[i], 
x_dims[i]); + } + auto mean_out_dims = mean_out->dims(); + auto variance_out_dims = variance_out->dims(); + auto saved_mean_dims = saved_mean->dims(); + auto saved_variance_dims = saved_variance->dims(); + CHECK_EQ(mean_out_dims.size(), 1UL); + CHECK_EQ(variance_out_dims.size(), 1UL); + CHECK_EQ(saved_mean_dims.size(), 1UL); + CHECK_EQ(saved_variance_dims.size(), 1UL); + CHECK_EQ(mean_out_dims[0], channel_size); + CHECK_EQ(variance_out_dims[0], channel_size); + CHECK_EQ(saved_mean_dims[0], channel_size); + CHECK_EQ(saved_variance_dims[0], channel_size); +} + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/operators/conv_op.cc b/paddle/fluid/lite/operators/conv_op.cc index 63838efd6fe57150dd09ca8d2608ec81f056e3dc..948e2a0641c28ed03dc5dc6cb30d60c80cec129c 100644 --- a/paddle/fluid/lite/operators/conv_op.cc +++ b/paddle/fluid/lite/operators/conv_op.cc @@ -24,31 +24,49 @@ bool ConvOpLite::CheckShape() const { CHECK_OR_FALSE(param_.x); CHECK_OR_FALSE(param_.output); CHECK_OR_FALSE(param_.filter); - return true; -} + // bias is optional. -bool ConvOpLite::InferShape() const { - auto in_dims = param_.x->dims(); - auto filter_dims = param_.filter->dims(); - std::vector strides = param_.strides; - std::vector paddings = param_.paddings; - int groups = param_.groups; - std::vector dilations = param_.dilations; + const auto in_dims = param_.x->dims(); + const auto filter_dims = param_.filter->dims(); CHECK_OR_FALSE(in_dims.size() == 4 || in_dims.size() == 5); + CHECK_EQ_OR_FALSE(in_dims.size(), filter_dims.size()); - CHECK_OR_FALSE(in_dims.size() - strides.size() == 2U); - CHECK_EQ_OR_FALSE(paddings.size(), strides.size()); - CHECK_EQ_OR_FALSE(in_dims[1], filter_dims[1] * groups); - CHECK_EQ_OR_FALSE(filter_dims[0] % groups, 0); + CHECK_OR_FALSE(in_dims.size() - param_.strides.size() == 2U); + CHECK_EQ_OR_FALSE(param_.paddings.size(), param_.strides.size()); + + CHECK_EQ_OR_FALSE(in_dims[1], filter_dims[1] * param_.groups); + CHECK_EQ_OR_FALSE(filter_dims[0] % param_.groups, 0); + CHECK_EQ_OR_FALSE(filter_dims.size(), 4UL); + + return true; +} + +inline int ConvOutputSize(int input_size, int filter_size, int dilation, + int padding, int stride) { + const int dkernel = dilation * (filter_size - 1) + 1; + int output_size = (input_size + 2 * padding - dkernel) / stride + 1; + CHECK_GT_OR_FALSE(output_size, 0); + + return output_size; +} + +bool ConvOpLite::InferShape() const { + const auto in_dims = param_.x->dims(); + const auto filter_dims = param_.filter->dims(); std::vector output_shape({in_dims[0], filter_dims[0]}); - for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], - dilations[i], paddings[i], - strides[i])); + for (size_t i = 0; i < param_.strides.size(); ++i) { + output_shape.push_back( + ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], param_.dilations[i], + param_.paddings[i], param_.strides[i])); } + + // Set output dims param_.output->Resize(lite::DDim(output_shape)); + + // share LoD + // param_.output->set_lod(param_.x->lod()); return true; } diff --git a/paddle/fluid/lite/operators/conv_op.h b/paddle/fluid/lite/operators/conv_op.h index 3ab30eb787bd9574a10cc9198f4c08b744eb0c27..393b5dc2a8e5e9aa8d94784bc4f5a8d041414200 100644 --- a/paddle/fluid/lite/operators/conv_op.h +++ b/paddle/fluid/lite/operators/conv_op.h @@ -26,63 +26,53 @@ namespace paddle { namespace lite { namespace operators { -inline int ConvOutputSize(int input_size, int filter_size, int dilation, - 
int padding, int stride) { - const int dkernel = dilation * (filter_size - 1) + 1; - int output_size = (input_size + 2 * padding - dkernel) / stride + 1; - CHECK_OR_FALSE(output_size > 0); - - return output_size; -} - -inline bool IsExpand(const std::vector& filter_dim, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations) { - bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true; - for (size_t j = 0; j < strides.size(); ++j) { - filter_1 = filter_1 && (static_cast(filter_dim[j + 2]) == 1); - strides_1 = strides_1 && (strides[j] == 1); - padding_0 = padding_0 && (paddings[j] == 0); - dilation_1 = dilation_1 && (dilations[j] == 1); - } - return !(filter_1 && strides_1 && padding_0 && dilation_1); -} - class ConvOpLite : public OpLite { public: ConvOpLite() {} - explicit ConvOpLite(const std::string& type) : OpLite(type) {} + explicit ConvOpLite(const std::string &type) : OpLite(type) {} bool CheckShape() const override; bool InferShape() const override; - void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); } // TODO(Superjomn) replace framework::OpDesc with a lite one. - bool AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) override { - auto X = op_desc.Input("Input").front(); - auto Filter = op_desc.Input("Filter").front(); - auto Bias = op_desc.Input("Bias").front(); - // auto ResidualData = op_desc.Input("ResidualData"); - auto Out = op_desc.Output("Output").front(); - - param_.x = scope->FindVar(X)->GetMutable(); - param_.filter = scope->FindVar(Filter)->GetMutable(); - param_.bias = scope->FindVar(Bias)->GetMutable(); - // param_.residualData = - // scope->FindVar(ResidualData)->GetMutable(); - param_.output = scope->FindVar(Out)->GetMutable(); - + bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override { + auto input = op_desc.Input("Input").front(); + auto filter = op_desc.Input("Filter").front(); + auto out = op_desc.Output("Out").front(); + param_.x = scope->FindVar(input)->GetMutable(); + param_.filter = scope->FindVar(filter)->GetMutable(); + CHECK(scope->FindVar(out)); + param_.output = scope->FindVar(out)->GetMutable(); param_.strides = op_desc.GetAttr>("strides"); param_.paddings = op_desc.GetAttr>("paddings"); param_.groups = op_desc.GetAttr("groups"); param_.dilations = op_desc.GetAttr>("dilations"); - + // optional params + std::vector input_arg_names = op_desc.InputArgumentNames(); + if (std::find(input_arg_names.begin(), input_arg_names.end(), "Bias") != + input_arg_names.end()) { + auto bias_var = scope->FindVar(op_desc.Input("Bias").front()); + if (bias_var != nullptr) { + param_.bias = + const_cast(&(bias_var->Get())); + } + } + if (std::find(input_arg_names.begin(), input_arg_names.end(), + "ResidualData") != input_arg_names.end()) { + auto residual_data_var = + scope->FindVar(op_desc.Input("ResidualData").front()); + if (residual_data_var != nullptr) { + param_.residualData = const_cast( + &(residual_data_var->Get())); + } + } return true; } + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "conv2d"; } private: diff --git a/paddle/fluid/lite/operators/op_params.h b/paddle/fluid/lite/operators/op_params.h index 23b21cb276442d4e1da8b83557007a132c9de3fb..91a6067959854f608e31a6151a4e63e26df7eb64 100644 --- a/paddle/fluid/lite/operators/op_params.h +++ b/paddle/fluid/lite/operators/op_params.h @@ -57,6 +57,7 @@ struct FcParam { lite::Tensor* output{}; lite::DDim in_mat_dims; int 
in_num_col_dims{1}; + bool weight_transposed{false}; }; struct ReluParam { @@ -124,8 +125,8 @@ struct ConcatParam { struct ConvParam { lite::Tensor* x{}; lite::Tensor* filter{}; - lite::Tensor* bias{}; - lite::Tensor* residualData{}; + lite::Tensor* bias{nullptr}; + lite::Tensor* residualData{nullptr}; lite::Tensor* output{}; std::vector strides{1, 1}; std::vector paddings{0, 0}; @@ -145,6 +146,25 @@ struct ConvParam { std::string data_format{"Anylayout"}; }; +// For BatchNorm op +struct BatchNormParam { + lite::Tensor* x{}; + lite::Tensor* bias{}; + lite::Tensor* scale{}; + lite::Tensor* mean{}; + lite::Tensor* variance{}; + lite::Tensor* y{}; + lite::Tensor* mean_out{}; + lite::Tensor* variance_out{}; + lite::Tensor* saved_mean{}; + lite::Tensor* saved_variance{}; + bool is_test{true}; + bool use_global_stats{false}; + float epsilon; + float momentum; + DataLayoutType data_layout{DATALAYOUT(kNCHW)}; +}; + // For Pooling op struct PoolParam { lite::Tensor* x{}; @@ -174,6 +194,15 @@ struct DropoutParam { std::string dropout_implementation{"downgrade_in_infer"}; }; +// For Split op +struct SplitParam { + lite::Tensor* x{}; + std::vector output{}; + int axis{-1}; + int num{0}; + std::vector sections; +}; + /// ----------------------- element wise operators ---------------------- struct ElementwiseParam { const lite::Tensor* X{}; diff --git a/paddle/fluid/lite/operators/pool_op.cc b/paddle/fluid/lite/operators/pool_op.cc index 055f00f90a47766d5a76bcf01cae3f68e14d71e2..3faf2bf0fa4f3290921a6b40739d39a2f10b9c41 100644 --- a/paddle/fluid/lite/operators/pool_op.cc +++ b/paddle/fluid/lite/operators/pool_op.cc @@ -19,6 +19,27 @@ namespace paddle { namespace lite { namespace operators { +bool PoolOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.output); + + const auto& x_dims = param_.x->dims(); + const auto& ksize = param_.ksize; + const auto& strides = param_.strides; + const auto& paddings = param_.paddings; + + // "Pooling input should be 4-D or 5-D tensor." + CHECK_OR_FALSE(x_dims.size() == 4 || x_dims.size() == 5); + // Input size and pooling size should be consistent. + CHECK_OR_FALSE(x_dims.size() - ksize.size() == 2U); + // Strides size and pooling size should be the same. + CHECK_OR_FALSE(ksize.size() == strides.size()); + // Paddings size and pooling size should be the same. 
+ CHECK_OR_FALSE(ksize.size() == paddings.size()); + + return true; +} + int PoolOutputSize(int input_size, int filter_size, int padding, int stride, bool ceil_mode) { int output_size; @@ -28,46 +49,35 @@ int PoolOutputSize(int input_size, int filter_size, int padding, int stride, output_size = (input_size - filter_size + 2 * padding + stride - 1) / stride + 1; } - CHECK_OR_FALSE(output_size > 0); return output_size; } -bool PoolOpLite::CheckShape() const { - CHECK_OR_FALSE(param_.x); - CHECK_OR_FALSE(param_.output); - return true; -} - bool PoolOpLite::InferShape() const { - const auto input_dims = param_.x->dims(); - CHECK_OR_FALSE(input_dims.size() == 4 || input_dims.size() == 5); - + const auto x_dims = param_.x->dims(); + std::vector& ksize = param_.ksize; if (param_.global_pooling) { - param_.ksize.resize(static_cast(input_dims.size()) - 2); - for (size_t i = 0; i < param_.ksize.size(); ++i) { + ksize.resize(static_cast(x_dims.size()) - 2); + for (size_t i = 0; i < ksize.size(); ++i) { param_.paddings[i] = 0; - param_.ksize[i] = static_cast(input_dims[i + 2]); + ksize[i] = static_cast(x_dims[i + 2]); } } - CHECK_OR_FALSE(input_dims.size() - param_.ksize.size() == 2U); - CHECK_EQ_OR_FALSE(param_.ksize.size(), param_.strides.size()); - CHECK_EQ_OR_FALSE(param_.ksize.size(), param_.paddings.size()); - - std::vector output_shape({input_dims[0], input_dims[1]}); + std::vector output_shape({x_dims[0], x_dims[1]}); if (param_.adaptive) { output_shape.insert(output_shape.end(), param_.ksize.begin(), param_.ksize.end()); } else { for (size_t i = 0; i < param_.ksize.size(); ++i) { output_shape.push_back( - PoolOutputSize(input_dims[i + 2], param_.ksize[i], param_.paddings[i], + PoolOutputSize(x_dims[i + 2], param_.ksize[i], param_.paddings[i], param_.strides[i], param_.ceil_mode)); } } - // share LoD - // param_.output->set_lod(param_.input->lod()); param_.output->Resize(lite::DDim(output_shape)); + + // ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + // ctx->ShareLoD("X", "Out"); return true; } diff --git a/paddle/fluid/lite/operators/pool_op.h b/paddle/fluid/lite/operators/pool_op.h index 64c15ccf1db813c2a4d0465b86ed3c6d46091f73..2e9a02eec189599ba2fc23da8e7bcc9ebd0ea8b3 100644 --- a/paddle/fluid/lite/operators/pool_op.h +++ b/paddle/fluid/lite/operators/pool_op.h @@ -13,8 +13,10 @@ // limitations under the License. #pragma once + #include #include +#include "paddle/fluid/lite/core/compatible_tensor.h" #include "paddle/fluid/lite/core/kernel.h" #include "paddle/fluid/lite/core/op_lite.h" #include "paddle/fluid/lite/core/scope.h" @@ -35,24 +37,32 @@ class PoolOpLite : public OpLite { bool InferShape() const override; - void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } // TODO(Superjomn) replace framework::OpDesc with a lite one. 
bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override { - auto input = op_desc.Input("X").front(); + auto x = op_desc.Input("X").front(); auto out = op_desc.Output("Out").front(); - param_.x = scope->FindVar(input)->GetMutable(); - param_.output = scope->FindVar(out)->GetMutable(); + CHECK(scope->FindVar(x)); + CHECK(scope->FindVar(out)); + param_.x = scope->FindVar(x)->GetMutable(); + param_.output = scope->FindVar(out)->GetMutable(); + param_.pooling_type = op_desc.GetAttr("pooling_type"); param_.ksize = op_desc.GetAttr>("ksize"); + param_.global_pooling = op_desc.GetAttr("global_pooling"); param_.strides = op_desc.GetAttr>("strides"); param_.paddings = op_desc.GetAttr>("paddings"); - param_.ceil_mode = op_desc.GetAttr("ceil_mode"); + + param_.exclusive = op_desc.GetAttr("exclusive"); param_.adaptive = op_desc.GetAttr("adaptive"); - param_.global_pooling = op_desc.GetAttr("global_pooling"); + param_.ceil_mode = op_desc.GetAttr("ceil_mode"); + param_.use_quantizer = op_desc.GetAttr("use_quantizer"); + // param_.data_format = op_desc.GetAttr("data_format"); return true; } + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "pool"; } private: diff --git a/paddle/fluid/lite/operators/pool_op_test.cc b/paddle/fluid/lite/operators/pool_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..9ab2865f1d04f2ca173b9d2f5f7d9e457f6754e8 --- /dev/null +++ b/paddle/fluid/lite/operators/pool_op_test.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/lite/operators/pool_op.h" +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +TEST(pool_op_lite, test) { + // prepare variables + Scope scope; + auto* x = scope.Var("x")->GetMutable(); + auto* output = scope.Var("output")->GetMutable(); + x->Resize(DDim(std::vector({1, 3, 224, 224}))); + output->Resize(DDim(std::vector{1, 3, 112, 112})); + + // set data + for (int i = 0; i < 1 * 3 * 224 * 224; i++) { + x->mutable_data()[i] = i; + } + for (int i = 0; i < 1 * 3 * 112 * 112; i++) { + output->mutable_data()[i] = 0.; + } + + // prepare op desc + cpp::OpDesc desc; + desc.SetType("pool"); + desc.SetInput("X", {"x"}); + desc.SetOutput("Out", {"output"}); + + std::string pooling_type("max"); + desc.SetAttr("pooling_type", pooling_type); + // desc.SetAttr("ksize", static_cast>({2, 2})); + std::vector ksize{2, 2}; + desc.SetAttr("ksize", ksize); + + bool global_pooling{false}; + desc.SetAttr("global_pooling", global_pooling); + + std::vector strides{1, 1}; + desc.SetAttr("strides", strides); + + std::vector paddings{0, 0}; + desc.SetAttr("paddings", paddings); + + bool exclusive{true}; + desc.SetAttr("exclusive", exclusive); + + bool adaptive{false}; + desc.SetAttr("adaptive", adaptive); + + bool ceil_mode{false}; + desc.SetAttr("ceil_mode", ceil_mode); + + bool use_quantizer{false}; + desc.SetAttr("use_quantizer", use_quantizer); + + PoolOpLite pool("pool"); + pool.SetValidPlaces({Place{TARGET(kARM), PRECISION(kFloat)}}); + pool.Attach(desc, &scope); + auto kernels = pool.CreateKernels({Place{TARGET(kARM), PRECISION(kFloat)}}); + LOG(INFO) << "kernels.size(): " << kernels.size(); +#ifdef LITE_WITH_ARM + ASSERT_FALSE(kernels.empty()); +#else + ASSERT_TRUE(kernels.empty()); +#endif +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +#ifdef LITE_WITH_ARM +USE_LITE_KERNEL(pool, kARM, kFloat, kNCHW, def); +#endif diff --git a/paddle/fluid/lite/operators/split_op.cc b/paddle/fluid/lite/operators/split_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0d5075b0971e4bd98de8aac9810bbe7514c1a562 --- /dev/null +++ b/paddle/fluid/lite/operators/split_op.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/lite/operators/split_op.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SplitOp::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_GT_OR_FALSE(param_.output.size(), 1UL); + auto x_dims = param_.x->dims(); + auto x_rank = x_dims.size(); + CHECK_OR_FALSE(param_.axis >= -static_cast(x_rank) && + param_.axis < static_cast(x_rank)); + return true; +} + +bool SplitOp::InferShape() const { + const auto &outs = param_.output; + auto in_dims = param_.x->dims(); + int axis = param_.axis; + int num = param_.num; + const auto §ions = param_.sections; + + const int outs_number = outs.size(); + std::vector outs_dims; + outs_dims.reserve(outs_number); + + if (num > 0) { + int out_axis_dim = in_dims[axis] / num; + for (int i = 0; i < outs_number; ++i) { + auto dim = in_dims; + dim[axis] = out_axis_dim; + outs_dims.push_back(dim); + } + } else if (sections.size() > 0) { + for (size_t i = 0; i < outs_number; ++i) { + auto dim = in_dims; + dim[axis] = sections[i]; + outs_dims.push_back(dim); + } + } + + for (int j = 0; j < outs_dims.size(); ++j) { + outs[j]->Resize(outs_dims[j]); + } + + return true; +} + +bool SplitOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { + param_.axis = opdesc.GetAttr("axis"); + param_.num = opdesc.GetAttr("num"); + param_.sections = opdesc.GetAttr>("sections"); + param_.x = const_cast( + &scope->FindVar(opdesc.Input("X").front())->Get()); + auto outs = opdesc.Output("Out"); + for (auto var : outs) { + param_.output.push_back(scope->FindVar(var)->GetMutable()); + } + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(split, paddle::lite::operators::SplitOp); diff --git a/paddle/fluid/lite/operators/split_op.h b/paddle/fluid/lite/operators/split_op.h new file mode 100644 index 0000000000000000000000000000000000000000..20dc4b1028c27f4efab558694285e44d46182ef8 --- /dev/null +++ b/paddle/fluid/lite/operators/split_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include "paddle/fluid/lite/core/op_lite.h" +#include "paddle/fluid/lite/core/scope.h" +#include "paddle/fluid/lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SplitOp : public OpLite { + public: + SplitOp() {} + explicit SplitOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "split"; } + + private: + mutable SplitParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/tools/Dockerfile.mobile b/paddle/fluid/lite/tools/Dockerfile.mobile index e48af1227513feeacff78ba69236c44e4f29ab7b..6bba15b7b70594262941f8df7a088840d2cab065 100644 --- a/paddle/fluid/lite/tools/Dockerfile.mobile +++ b/paddle/fluid/lite/tools/Dockerfile.mobile @@ -88,3 +88,4 @@ RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple wheel RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pre-commit RUN apt-get autoremove -y && apt-get clean RUN rm -rf /sdk-tools-linux-4333796.zip /tmp/android-ndk-r17c-linux-x86_64.zip /cmake-3.10.3-Linux-x86_64.tar.gz + \ No newline at end of file diff --git a/paddle/fluid/lite/tools/build.sh b/paddle/fluid/lite/tools/build.sh index 54d938098749415ecf3d5a1ccf4ca72cd1b2b3e9..392e9b82bb5e66bc835f8a1c1edc21f8fc9c81d5 100755 --- a/paddle/fluid/lite/tools/build.sh +++ b/paddle/fluid/lite/tools/build.sh @@ -59,11 +59,15 @@ function cmake_arm { -DARM_TARGET_OS=$1 -DARM_TARGET_ARCH_ABI=$2 } +function build_single { + #make $1 -j$(expr $(nproc) - 2) + make $1 -j8 +} + function build { file=$1 for _test in $(cat $file); do - #make $_test -j$(expr $(nproc) - 2) - make $_test -j8 + build_single $_test done } @@ -81,39 +85,6 @@ function test_lite { done } -port_armv8=5554 -port_armv7=5556 - -# Run test on android -function test_lite_android { - local file=$1 - local adb_abi=$2 - local port= - if [[ ${adb_abi} == "armeabi-v7a" ]]; then - port=${port_armv7} - fi - - if [[ ${adb_abi} == "arm64-v8a" ]]; then - port=${port_armv8} - fi - if [[ "${port}x" == "x" ]]; then - echo "Port can not be empty" - exit 1 - fi - - echo "file: ${file}" - # push all to adb and test - adb_work_dir="/data/local/tmp" - skip_list="test_model_parser_lite" - for _test in $(cat $file); do - [[ $skip_list =~ (^|[[:space:]])$_test($|[[:space:]]) ]] && continue || echo 'skip $_test' - testpath=$(find ./paddle/fluid -name ${_test}) - adb -s emulator-${port} push ${testpath} ${adb_work_dir} - adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${_test}" - adb -s emulator-${port} shell "./${adb_work_dir}/${_test}" - done -} - # Build the code and run lite server tests. This is executed in the CI system. function build_test_server { mkdir -p ./build @@ -126,8 +97,34 @@ function build_test_server { build $LIBS_FILE } -# Build the code and run lite server tests. This is executed in the CI system. 
+# test_arm_android +function test_arm_android { + test_name=$1 + port=$2 + if [[ "${test_name}x" == "x" ]]; then + echo "test_name can not be empty" + exit 1 + fi + if [[ "${port}x" == "x" ]]; then + echo "Port can not be empty" + exit 1 + fi + + echo "test name: ${test_name}" + adb_work_dir="/data/local/tmp" + skip_list="test_model_parser_lite" # add more with space + [[ $skip_list =~ (^|[[:space:]])$test_name($|[[:space:]]) ]] && { echo "skip ${test_name}"; return 0; } + testpath=$(find ./paddle/fluid -name ${test_name}) + adb -s emulator-${port} push ${testpath} ${adb_work_dir} + adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${test_name}" + adb -s emulator-${port} shell "./${adb_work_dir}/${test_name}" +} + +# Build the code and run lite arm tests. This is executed in the CI system. function build_test_arm { + port_armv8=5554 + port_armv7=5556 + adb kill-server adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done # start android arm64-v8a armeabi-v7a emulators first @@ -140,6 +137,7 @@ function build_test_arm { for os in "android" "armlinux" ; do for abi in "arm64-v8a" "armeabi-v7a" "armeabi-v7a-hf" ; do + # TODO(TJ): enable compile on v7-hf on android and all v7 on armlinux if [[ ${abi} == "armeabi-v7a-hf" ]]; then echo "armeabi-v7a-hf is not supported on both android and armlinux" continue fi @@ -156,17 +154,30 @@ function build_test_arm { cmake_arm ${os} ${abi} build $TESTS_FILE + # armlinux need in another docker + # TODO(TJ): enable test with armlinux if [[ ${os} == "android" ]]; then adb_abi=${abi} if [[ ${adb_abi} == "armeabi-v7a-hf" ]]; then adb_abi="armeabi-v7a" fi if [[ ${adb_abi} == "armeabi-v7a" ]]; then - # skip v7 tests + # skip all armv7 tests + # TODO(TJ): enable test with armv7 continue fi - test_lite_android $TESTS_FILE ${adb_abi} - # armlinux need in another docker + local port= + if [[ ${adb_abi} == "armeabi-v7a" ]]; then + port=${port_armv7} + fi + + if [[ ${adb_abi} == "arm64-v8a" ]]; then + port=${port_armv8} + fi + echo "test file: ${TESTS_FILE}" + for _test in $(cat $TESTS_FILE); do + test_arm_android $_test $port + done fi cd - done @@ -182,12 +193,13 @@ function print_usage { echo "----------------------------------------" echo -e "cmake_x86: run cmake with X86 mode" echo -e "cmake_cuda: run cmake with CUDA mode" - echo -e "cmake_arm: run cmake with ARM mode" + echo -e "--arm_os= --arm_abi= cmake_arm: run cmake with ARM mode" echo echo -e "build: compile the tests" + echo -e "--test_name= build_single: compile single test" echo echo -e "test_server: run server tests" - echo -e "test_mobile: run mobile tests" + echo -e "--test_name= --arm_port= test_arm_android: run arm test" echo "----------------------------------------" echo } @@ -200,11 +212,31 @@ function main { TESTS_FILE="${i#*=}" shift ;; + --test_name=*) + TEST_NAME="${i#*=}" + shift + ;; + --arm_os=*) + ARM_OS="${i#*=}" + shift + ;; + --arm_abi=*) + ARM_ABI="${i#*=}" + shift + ;; + --arm_port=*) + ARM_PORT="${i#*=}" + shift + ;; build) build $TESTS_FILE build $LIBS_FILE shift ;; + build_single) + build_single $TEST_NAME + shift + ;; cmake_x86) cmake_x86 shift @@ -214,15 +246,15 @@ function main { shift ;; cmake_arm) - cmake_arm $2 $3 + cmake_arm $ARM_OS $ARM_ABI shift ;; test_server) test_lite $TESTS_FILE shift ;; - test_mobile) - test_lite $TESTS_FILE + test_arm_android) + test_arm_android $TEST_NAME $ARM_PORT shift ;; build_test_server) @@ -250,6 +282,5 @@ function main { done } -print_usage - main $@ + diff --git 
a/paddle/fluid/lite/tools/mobile_readme.md b/paddle/fluid/lite/tools/mobile_readme.md index 2069de2af2664f31c2281d3486022f45d42e7d8e..b7ffbe6faa34860d029064246121e76c80fc06f0 100644 --- a/paddle/fluid/lite/tools/mobile_readme.md +++ b/paddle/fluid/lite/tools/mobile_readme.md @@ -124,3 +124,4 @@ $ adb devices List of devices attached 5cb00b6 device ``` + diff --git a/paddle/fluid/lite/utils/CMakeLists.txt b/paddle/fluid/lite/utils/CMakeLists.txt index 08eeaa54f8eacd359fa154762b6a1bff379686c5..f610b7aab5c25cec5d9b4fc18aecc65b3651332b 100644 --- a/paddle/fluid/lite/utils/CMakeLists.txt +++ b/paddle/fluid/lite/utils/CMakeLists.txt @@ -9,3 +9,4 @@ set(utils_DEPS glog) lite_cc_test(test_varient SRCS varient_test.cc DEPS utils_lite) cc_library(any_lite SRCS any.cc) cc_library(utils_lite SRCS cp_logging.cc string.cc DEPS ${utils_DEPS} any_lite) + diff --git a/paddle/fluid/lite/utils/any.h b/paddle/fluid/lite/utils/any.h index 466deae3de92ad5992695a505108e1e31b68a826..2a8c68063f0b17beb72b597d236f71e1a5c2bb79 100644 --- a/paddle/fluid/lite/utils/any.h +++ b/paddle/fluid/lite/utils/any.h @@ -34,7 +34,6 @@ class Any { CHECK(type_ == typeid(T).hash_code()); } else { type_ = typeid(T).hash_code(); - data_ = new T; deleter_ = [&] { delete static_cast(data_); }; } data_ = new T; @@ -55,10 +54,16 @@ class Any { bool valid() const { return data_; } + // ~Any() { + // if (valid()) { + // deleter_(); + // } + // } + private: static size_t kInvalidType; size_t type_{kInvalidType}; - void* data_{}; + void* data_{nullptr}; std::function deleter_; }; diff --git a/paddle/fluid/lite/x86/CMakeLists.txt b/paddle/fluid/lite/x86/CMakeLists.txt index 0347593e38af4af7cf2dd421801524bcb4d6d052..515933e2588844f2795ca676269965db9a9770fd 100644 --- a/paddle/fluid/lite/x86/CMakeLists.txt +++ b/paddle/fluid/lite/x86/CMakeLists.txt @@ -4,3 +4,4 @@ endif() cc_library(target_wrapper_x86 SRCS target_wrapper.cc) +
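For reference, a hypothetical invocation of the reworked build.sh entry points introduced in this patch (flag and subcommand names are taken from main() above; the emulator port and test target are illustrative assumptions, and flags must precede the subcommand because arguments are processed in order):

```bash
# Configure an Android ARMv8 build, then compile and run a single test on an
# emulator assumed to be listening on port 5554.
./paddle/fluid/lite/tools/build.sh --arm_os=android --arm_abi=arm64-v8a cmake_arm
./paddle/fluid/lite/tools/build.sh --test_name=test_batch_norm_op_lite build_single
./paddle/fluid/lite/tools/build.sh --test_name=test_batch_norm_op_lite --arm_port=5554 test_arm_android
```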