diff --git a/lite/backends/arm/math/CMakeLists.txt b/lite/backends/arm/math/CMakeLists.txt index 1c0e8e5bf9ad350e8948e06808c9510e476139bd..244467d62492bc3017ebdb6144b49ccb9fcd30c1 100644 --- a/lite/backends/arm/math/CMakeLists.txt +++ b/lite/backends/arm/math/CMakeLists.txt @@ -129,5 +129,6 @@ if (NOT HAS_ARM_MATH_LIB_DIR) reduce_prod.cc lstm.cc clip.cc + pixel_shuffle.cc DEPS ${lite_kernel_deps} context tensor) endif() diff --git a/lite/backends/arm/math/pixel_shuffle.cc b/lite/backends/arm/math/pixel_shuffle.cc new file mode 100644 index 0000000000000000000000000000000000000000..709b473de7b78ec82f9719b5a5309ffa851e0466 --- /dev/null +++ b/lite/backends/arm/math/pixel_shuffle.cc @@ -0,0 +1,233 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/arm/math/pixel_shuffle.h" +#include + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void pixel_shuffle_scale2_fp32(const float* input, + float* output, + const int num, + const int hin, + const int win, + const int chout, + const int hout, + const int wout) { + const int upscale_factor = 2; + const int feat_size_in = win * hin; + const int feat_size_out = wout * hout; + + const int cnt = win >> 2; + const int remain = win - (cnt << 2); + +#pragma omp parallel for + // batch * out_channel loop + for (int nc = 0; nc < num * chout; nc++) { + const float* inptr = input + nc * feat_size_out; + float* outptr = output + nc * feat_size_out; + + // out_height loop + for (int h = 0; h < hin; h++) { + for (int sh = 0; sh < upscale_factor; sh++) { + const float* inptr_loc0 = + inptr + h * win + sh * feat_size_in * upscale_factor; + const float* inptr_loc1 = inptr_loc0 + feat_size_in; + + // out_width loop + for (int i = 0; i < cnt; i++) { + float32x4_t vin0 = vld1q_f32(inptr_loc0); + float32x4_t vin1 = vld1q_f32(inptr_loc1); + + float32x4x2_t vin = {vin0, vin1}; + + vst2q_f32(outptr, vin); + outptr += 8; + + inptr_loc0 += 4; + inptr_loc1 += 4; + } + + for (int j = 0; j < remain; j++) { + outptr[0] = inptr_loc0[0]; + outptr[1] = inptr_loc1[0]; + inptr_loc0++; + inptr_loc1++; + outptr += upscale_factor; + } + } + } + } +} + +void pixel_shuffle_scale3_fp32(const float* input, + float* output, + const int num, + const int hin, + const int win, + const int chout, + const int hout, + const int wout) { + const int upscale_factor = 3; + const int feat_size_in = win * hin; + const int feat_size_out = wout * hout; + + const int cnt = win >> 2; + const int remain = win - (cnt << 2); + +#pragma omp parallel for + // batch * out_channel loop + for (int nc = 0; nc < num * chout; nc++) { + const float* inptr = input + nc * feat_size_out; + float* outptr = output + nc * feat_size_out; + + // out_height loop + for (int h = 0; h < hin; h++) { + for (int sh = 0; sh < upscale_factor; sh++) { + const float* inptr_loc0 = + inptr + h * win + sh * feat_size_in * upscale_factor; + const float* inptr_loc1 = inptr_loc0 + feat_size_in; + const float* inptr_loc2 = inptr_loc1 + feat_size_in; + + // out_width loop + for (int i = 0; i < cnt; i++) { + float32x4_t vin0 = vld1q_f32(inptr_loc0); + float32x4_t vin1 = vld1q_f32(inptr_loc1); + float32x4_t vin2 = vld1q_f32(inptr_loc2); + + float32x4x3_t vin = {vin0, vin1, vin2}; + + vst3q_f32(outptr, vin); + outptr += 12; + + inptr_loc0 += 4; + inptr_loc1 += 4; + inptr_loc2 += 4; + } + + for (int j = 0; j < remain; j++) { + outptr[0] = inptr_loc0[0]; + outptr[1] = inptr_loc1[0]; + outptr[2] = inptr_loc2[0]; + inptr_loc0++; + inptr_loc1++; + inptr_loc2++; + outptr += upscale_factor; + } + } + } + } +} + +void pixel_shuffle_scale4_fp32(const float* input, + float* output, + const int num, + const int hin, + const int win, + const int chout, + const int hout, + const int wout) { + const int upscale_factor = 4; + const int feat_size_in = win * hin; + const int feat_size_out = wout * hout; + + const int cnt = win >> 2; + const int remain = win - (cnt << 2); + +#pragma omp parallel for + // batch * out_channel loop + for (int nc = 0; nc < num * chout; nc++) { + const float* inptr = input + nc * feat_size_out; + float* outptr = output + nc * feat_size_out; + + // out_height loop + for (int h = 0; h < hin; h++) { + for (int sh = 0; sh < upscale_factor; sh++) { + const float* inptr_loc0 = + inptr + h * win + sh * feat_size_in * upscale_factor; + const float* inptr_loc1 = inptr_loc0 + feat_size_in; + const float* inptr_loc2 = inptr_loc1 + feat_size_in; + const float* inptr_loc3 = inptr_loc2 + feat_size_in; + + // out_width loop + for (int i = 0; i < cnt; i++) { + float32x4_t vin0 = vld1q_f32(inptr_loc0); + float32x4_t vin1 = vld1q_f32(inptr_loc1); + float32x4_t vin2 = vld1q_f32(inptr_loc2); + float32x4_t vin3 = vld1q_f32(inptr_loc3); + + float32x4x4_t vin = {vin0, vin1, vin2, vin3}; + + vst4q_f32(outptr, vin); + outptr += 16; + + inptr_loc0 += 4; + inptr_loc1 += 4; + inptr_loc2 += 4; + inptr_loc3 += 4; + } + + for (int j = 0; j < remain; j++) { + outptr[0] = inptr_loc0[0]; + outptr[1] = inptr_loc1[0]; + outptr[2] = inptr_loc2[0]; + outptr[3] = inptr_loc3[0]; + inptr_loc0++; + inptr_loc1++; + inptr_loc2++; + inptr_loc3++; + outptr += upscale_factor; + } + } + } + } +} + +void pixel_shuffle_native_fp32(const float* input, + float* output, + const int num, + const int hin, + const int win, + const int chout, + const int hout, + const int wout, + const int upscale_factor) { +#pragma omp parallel for + for (int nc = 0; nc < num * chout; nc++) { + const float* inptr = input + nc * hout * wout; + float* outptr_nc = output + nc * hout * wout; + + for (int sh = 0; sh < upscale_factor; sh++) { + for (int sw = 0; sw < upscale_factor; sw++) { + float* outptr = outptr_nc + sh * wout + sw; + for (int h = 0; h < hin; h++) { + for (int w = 0; w < win; w++) { + outptr[0] = inptr[0]; + inptr++; + outptr += upscale_factor; + } + outptr += (upscale_factor - 1) * wout; + } + } + } + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/pixel_shuffle.h b/lite/backends/arm/math/pixel_shuffle.h new file mode 100644 index 0000000000000000000000000000000000000000..c7d57cc0b568f0e2f868790f3329e4f24905393c --- /dev/null +++ b/lite/backends/arm/math/pixel_shuffle.h @@ -0,0 +1,61 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void pixel_shuffle_scale2_fp32(const float* input, + float* output, + const int num, + const int hin, + const int win, + const int chout, + const int hout, + const int wout); +void pixel_shuffle_scale3_fp32(const float* input, + float* output, + const int num, + const int hin, + const int win, + const int chout, + const int hout, + const int wout); +void pixel_shuffle_scale4_fp32(const float* input, + float* output, + const int num, + const int hin, + const int win, + const int chout, + const int hout, + const int wout); +void pixel_shuffle_native_fp32(const float* input, + float* output, + const int num, + const int hin, + const int win, + const int chout, + const int hout, + const int wout, + const int upscale_factor); + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/arm/pixel_shuffle_compute.cc b/lite/kernels/arm/pixel_shuffle_compute.cc index 6a16dcec3ef8a46f6c17689cf2bbc5290a09724c..ad51d53f8b8ef61a8b9d984a2af3efbe40589c7b 100644 --- a/lite/kernels/arm/pixel_shuffle_compute.cc +++ b/lite/kernels/arm/pixel_shuffle_compute.cc @@ -13,9 +13,7 @@ // limitations under the License. #include "lite/kernels/arm/pixel_shuffle_compute.h" -#include -#include -#include "lite/backends/arm/math/funcs.h" +#include "lite/backends/arm/math/pixel_shuffle.h" #include "lite/core/op_registry.h" #include "lite/core/tensor.h" #include "lite/core/type_system.h" @@ -30,33 +28,52 @@ void PixelShuffleCompute::Run() { const float* x_data = param.x->data(); float* output_data = param.output->mutable_data(); - int upscale_factor = param.upscale_factor; + const int upscale_factor = param.upscale_factor; - int batch_size = param.x->dims()[0]; - int height = param.x->dims()[2]; - int width = param.x->dims()[3]; - int out_channels = param.output->dims()[1]; - int out_height = param.output->dims()[2]; - int out_width = param.output->dims()[3]; + const int batch_size = param.x->dims()[0]; + const int height = param.x->dims()[2]; + const int width = param.x->dims()[3]; + const int out_channels = param.output->dims()[1]; + const int out_height = param.output->dims()[2]; + const int out_width = param.output->dims()[3]; -#pragma omp parallel for - for (int nc = 0; nc < batch_size * out_channels; nc++) { - const float* inptr = x_data + nc * out_height * out_width; - float* outptr_nc = output_data + nc * out_height * out_width; - - for (int sh = 0; sh < upscale_factor; sh++) { - for (int sw = 0; sw < upscale_factor; sw++) { - float* outptr = outptr_nc + sh * out_width + sw; - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - outptr[0] = inptr[0]; - inptr++; - outptr += upscale_factor; - } - outptr += (upscale_factor - 1) * out_width; - } - } - } + if (upscale_factor == 2) { + lite::arm::math::pixel_shuffle_scale2_fp32(x_data, + output_data, + batch_size, + height, + width, + out_channels, + out_height, + out_width); + } else if (upscale_factor == 3) { + lite::arm::math::pixel_shuffle_scale3_fp32(x_data, + output_data, + batch_size, + height, + width, + out_channels, + out_height, + out_width); + } else if (upscale_factor == 4) { + lite::arm::math::pixel_shuffle_scale4_fp32(x_data, + output_data, + batch_size, + height, + width, + out_channels, + out_height, + out_width); + } else { + lite::arm::math::pixel_shuffle_native_fp32(x_data, + output_data, + batch_size, + height, + width, + out_channels, + out_height, + out_width, + upscale_factor); } #ifdef LITE_WITH_PROFILE