// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "lite/backends/arm/math/pixel_shuffle.h" #include namespace paddle { namespace lite { namespace arm { namespace math { void pixel_shuffle_scale2_fp32(const float* input, float* output, const int num, const int hin, const int win, const int chout, const int hout, const int wout) { const int upscale_factor = 2; const int feat_size_in = win * hin; const int feat_size_out = wout * hout; const int cnt = win >> 2; const int remain = win - (cnt << 2); #pragma omp parallel for // batch * out_channel loop for (int nc = 0; nc < num * chout; nc++) { const float* inptr = input + nc * feat_size_out; float* outptr = output + nc * feat_size_out; // out_height loop for (int h = 0; h < hin; h++) { for (int sh = 0; sh < upscale_factor; sh++) { const float* inptr_loc0 = inptr + h * win + sh * feat_size_in * upscale_factor; const float* inptr_loc1 = inptr_loc0 + feat_size_in; // out_width loop for (int i = 0; i < cnt; i++) { float32x4_t vin0 = vld1q_f32(inptr_loc0); float32x4_t vin1 = vld1q_f32(inptr_loc1); float32x4x2_t vin = {vin0, vin1}; vst2q_f32(outptr, vin); outptr += 8; inptr_loc0 += 4; inptr_loc1 += 4; } for (int j = 0; j < remain; j++) { outptr[0] = inptr_loc0[0]; outptr[1] = inptr_loc1[0]; inptr_loc0++; inptr_loc1++; outptr += upscale_factor; } } } } } void pixel_shuffle_scale3_fp32(const float* input, float* output, const int num, const int hin, const int win, const int chout, const int hout, const int wout) { const int upscale_factor = 3; const int feat_size_in = win * hin; const int feat_size_out = wout * hout; const int cnt = win >> 2; const int remain = win - (cnt << 2); #pragma omp parallel for // batch * out_channel loop for (int nc = 0; nc < num * chout; nc++) { const float* inptr = input + nc * feat_size_out; float* outptr = output + nc * feat_size_out; // out_height loop for (int h = 0; h < hin; h++) { for (int sh = 0; sh < upscale_factor; sh++) { const float* inptr_loc0 = inptr + h * win + sh * feat_size_in * upscale_factor; const float* inptr_loc1 = inptr_loc0 + feat_size_in; const float* inptr_loc2 = inptr_loc1 + feat_size_in; // out_width loop for (int i = 0; i < cnt; i++) { float32x4_t vin0 = vld1q_f32(inptr_loc0); float32x4_t vin1 = vld1q_f32(inptr_loc1); float32x4_t vin2 = vld1q_f32(inptr_loc2); float32x4x3_t vin = {vin0, vin1, vin2}; vst3q_f32(outptr, vin); outptr += 12; inptr_loc0 += 4; inptr_loc1 += 4; inptr_loc2 += 4; } for (int j = 0; j < remain; j++) { outptr[0] = inptr_loc0[0]; outptr[1] = inptr_loc1[0]; outptr[2] = inptr_loc2[0]; inptr_loc0++; inptr_loc1++; inptr_loc2++; outptr += upscale_factor; } } } } } void pixel_shuffle_scale4_fp32(const float* input, float* output, const int num, const int hin, const int win, const int chout, const int hout, const int wout) { const int upscale_factor = 4; const int feat_size_in = win * hin; const int feat_size_out = wout * hout; const int cnt = win >> 2; const int remain = win - (cnt << 2); #pragma omp parallel for // batch * out_channel loop for (int nc = 0; nc < num * chout; nc++) { const float* inptr = input + nc * feat_size_out; float* outptr = output + nc * feat_size_out; // out_height loop for (int h = 0; h < hin; h++) { for (int sh = 0; sh < upscale_factor; sh++) { const float* inptr_loc0 = inptr + h * win + sh * feat_size_in * upscale_factor; const float* inptr_loc1 = inptr_loc0 + feat_size_in; const float* inptr_loc2 = inptr_loc1 + feat_size_in; const float* inptr_loc3 = inptr_loc2 + feat_size_in; // out_width loop for (int i = 0; i < cnt; i++) { float32x4_t vin0 = vld1q_f32(inptr_loc0); float32x4_t vin1 = vld1q_f32(inptr_loc1); float32x4_t vin2 = vld1q_f32(inptr_loc2); float32x4_t vin3 = vld1q_f32(inptr_loc3); float32x4x4_t vin = {vin0, vin1, vin2, vin3}; vst4q_f32(outptr, vin); outptr += 16; inptr_loc0 += 4; inptr_loc1 += 4; inptr_loc2 += 4; inptr_loc3 += 4; } for (int j = 0; j < remain; j++) { outptr[0] = inptr_loc0[0]; outptr[1] = inptr_loc1[0]; outptr[2] = inptr_loc2[0]; outptr[3] = inptr_loc3[0]; inptr_loc0++; inptr_loc1++; inptr_loc2++; inptr_loc3++; outptr += upscale_factor; } } } } } void pixel_shuffle_native_fp32(const float* input, float* output, const int num, const int hin, const int win, const int chout, const int hout, const int wout, const int upscale_factor) { #pragma omp parallel for for (int nc = 0; nc < num * chout; nc++) { const float* inptr = input + nc * hout * wout; float* outptr_nc = output + nc * hout * wout; for (int sh = 0; sh < upscale_factor; sh++) { for (int sw = 0; sw < upscale_factor; sw++) { float* outptr = outptr_nc + sh * wout + sw; for (int h = 0; h < hin; h++) { for (int w = 0; w < win; w++) { outptr[0] = inptr[0]; inptr++; outptr += upscale_factor; } outptr += (upscale_factor - 1) * wout; } } } } } } // namespace math } // namespace arm } // namespace lite } // namespace paddle