Unverified · Commit dfdfa644, authored by zhaoyang-star, committed by GitHub

[ARM] Update op pixel_shuffle for arm kernel. test=develop (#4145)

Parent commit: 8de02e61
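For context, pixel_shuffle rearranges an NCHW fp32 tensor of shape (N, C*r*r, H, W) into (N, C, H*r, W*r) so that out[n][c][h*r + sh][w*r + sw] = in[n][c*r*r + sh*r + sw][h][w], which is the mapping the new ARM kernels below implement. The following standalone reference sketch illustrates that mapping on raw buffers; the function name pixel_shuffle_ref and its signature are illustrative only and are not part of this commit.

#include <cstddef>

// Reference pixel_shuffle on contiguous NCHW fp32 buffers (illustrative only).
// in  : shape (num, chout * r * r, hin, win)
// out : shape (num, chout, hin * r, win * r)
void pixel_shuffle_ref(const float* in, float* out,
                       int num, int chout, int hin, int win, int r) {
  const int hout = hin * r;
  const int wout = win * r;
  for (int n = 0; n < num; ++n) {
    for (int c = 0; c < chout; ++c) {
      for (int h = 0; h < hin; ++h) {
        for (int w = 0; w < win; ++w) {
          for (int sh = 0; sh < r; ++sh) {
            for (int sw = 0; sw < r; ++sw) {
              const int cin = c * r * r + sh * r + sw;
              const std::size_t src =
                  ((static_cast<std::size_t>(n) * chout * r * r + cin) * hin + h) * win + w;
              const std::size_t dst =
                  ((static_cast<std::size_t>(n) * chout + c) * hout + h * r + sh) * wout +
                  w * r + sw;
              out[dst] = in[src];
            }
          }
        }
      }
    }
  }
}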
lite/backends/arm/math/CMakeLists.txt
@@ -129,5 +129,6 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
       reduce_prod.cc
       lstm.cc
       clip.cc
+      pixel_shuffle.cc
       DEPS ${lite_kernel_deps} context tensor)
 endif()

lite/backends/arm/math/pixel_shuffle.cc
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/arm/math/pixel_shuffle.h"
#include <arm_neon.h>
namespace paddle {
namespace lite {
namespace arm {
namespace math {
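
// Specialized NEON path for upscale_factor == 2.
// For output channel nc, the 4 source feature maps start at
// input + nc * feat_size_out (feat_size_out == 4 * feat_size_in).
// Each output row 2 * h + sh interleaves row h of input channels
// 2 * sh and 2 * sh + 1 via vst2q_f32; the per-row tail (win % 4
// elements) is handled with scalar stores.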
void pixel_shuffle_scale2_fp32(const float* input,
float* output,
const int num,
const int hin,
const int win,
const int chout,
const int hout,
const int wout) {
const int upscale_factor = 2;
const int feat_size_in = win * hin;
const int feat_size_out = wout * hout;
const int cnt = win >> 2;
const int remain = win - (cnt << 2);
#pragma omp parallel for
// batch * out_channel loop
for (int nc = 0; nc < num * chout; nc++) {
const float* inptr = input + nc * feat_size_out;
float* outptr = output + nc * feat_size_out;
// out_height loop
for (int h = 0; h < hin; h++) {
for (int sh = 0; sh < upscale_factor; sh++) {
const float* inptr_loc0 =
inptr + h * win + sh * feat_size_in * upscale_factor;
const float* inptr_loc1 = inptr_loc0 + feat_size_in;
// out_width loop
for (int i = 0; i < cnt; i++) {
float32x4_t vin0 = vld1q_f32(inptr_loc0);
float32x4_t vin1 = vld1q_f32(inptr_loc1);
float32x4x2_t vin = {vin0, vin1};
vst2q_f32(outptr, vin);
outptr += 8;
inptr_loc0 += 4;
inptr_loc1 += 4;
}
for (int j = 0; j < remain; j++) {
outptr[0] = inptr_loc0[0];
outptr[1] = inptr_loc1[0];
inptr_loc0++;
inptr_loc1++;
outptr += upscale_factor;
}
}
}
}
}
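
// Specialized NEON path for upscale_factor == 3: same scheme, but each
// output row interleaves three source feature maps via vst3q_f32.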
void pixel_shuffle_scale3_fp32(const float* input,
float* output,
const int num,
const int hin,
const int win,
const int chout,
const int hout,
const int wout) {
const int upscale_factor = 3;
const int feat_size_in = win * hin;
const int feat_size_out = wout * hout;
const int cnt = win >> 2;
const int remain = win - (cnt << 2);
#pragma omp parallel for
// batch * out_channel loop
for (int nc = 0; nc < num * chout; nc++) {
const float* inptr = input + nc * feat_size_out;
float* outptr = output + nc * feat_size_out;
// out_height loop
for (int h = 0; h < hin; h++) {
for (int sh = 0; sh < upscale_factor; sh++) {
const float* inptr_loc0 =
inptr + h * win + sh * feat_size_in * upscale_factor;
const float* inptr_loc1 = inptr_loc0 + feat_size_in;
const float* inptr_loc2 = inptr_loc1 + feat_size_in;
// out_width loop
for (int i = 0; i < cnt; i++) {
float32x4_t vin0 = vld1q_f32(inptr_loc0);
float32x4_t vin1 = vld1q_f32(inptr_loc1);
float32x4_t vin2 = vld1q_f32(inptr_loc2);
float32x4x3_t vin = {vin0, vin1, vin2};
vst3q_f32(outptr, vin);
outptr += 12;
inptr_loc0 += 4;
inptr_loc1 += 4;
inptr_loc2 += 4;
}
for (int j = 0; j < remain; j++) {
outptr[0] = inptr_loc0[0];
outptr[1] = inptr_loc1[0];
outptr[2] = inptr_loc2[0];
inptr_loc0++;
inptr_loc1++;
inptr_loc2++;
outptr += upscale_factor;
}
}
}
}
}
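
// Specialized NEON path for upscale_factor == 4: same scheme, but each
// output row interleaves four source feature maps via vst4q_f32.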
void pixel_shuffle_scale4_fp32(const float* input,
float* output,
const int num,
const int hin,
const int win,
const int chout,
const int hout,
const int wout) {
const int upscale_factor = 4;
const int feat_size_in = win * hin;
const int feat_size_out = wout * hout;
const int cnt = win >> 2;
const int remain = win - (cnt << 2);
#pragma omp parallel for
// batch * out_channel loop
for (int nc = 0; nc < num * chout; nc++) {
const float* inptr = input + nc * feat_size_out;
float* outptr = output + nc * feat_size_out;
// out_height loop
for (int h = 0; h < hin; h++) {
for (int sh = 0; sh < upscale_factor; sh++) {
const float* inptr_loc0 =
inptr + h * win + sh * feat_size_in * upscale_factor;
const float* inptr_loc1 = inptr_loc0 + feat_size_in;
const float* inptr_loc2 = inptr_loc1 + feat_size_in;
const float* inptr_loc3 = inptr_loc2 + feat_size_in;
// out_width loop
for (int i = 0; i < cnt; i++) {
float32x4_t vin0 = vld1q_f32(inptr_loc0);
float32x4_t vin1 = vld1q_f32(inptr_loc1);
float32x4_t vin2 = vld1q_f32(inptr_loc2);
float32x4_t vin3 = vld1q_f32(inptr_loc3);
float32x4x4_t vin = {vin0, vin1, vin2, vin3};
vst4q_f32(outptr, vin);
outptr += 16;
inptr_loc0 += 4;
inptr_loc1 += 4;
inptr_loc2 += 4;
inptr_loc3 += 4;
}
for (int j = 0; j < remain; j++) {
outptr[0] = inptr_loc0[0];
outptr[1] = inptr_loc1[0];
outptr[2] = inptr_loc2[0];
outptr[3] = inptr_loc3[0];
inptr_loc0++;
inptr_loc1++;
inptr_loc2++;
inptr_loc3++;
outptr += upscale_factor;
}
}
}
}
}
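
// Generic scalar fallback for any upscale_factor. For output channel nc it
// walks the upscale_factor^2 input feature maps contiguously and scatters
// each input row into output positions (h * r + sh, w * r + sw).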
void pixel_shuffle_native_fp32(const float* input,
float* output,
const int num,
const int hin,
const int win,
const int chout,
const int hout,
const int wout,
const int upscale_factor) {
#pragma omp parallel for
for (int nc = 0; nc < num * chout; nc++) {
const float* inptr = input + nc * hout * wout;
float* outptr_nc = output + nc * hout * wout;
for (int sh = 0; sh < upscale_factor; sh++) {
for (int sw = 0; sw < upscale_factor; sw++) {
float* outptr = outptr_nc + sh * wout + sw;
for (int h = 0; h < hin; h++) {
for (int w = 0; w < win; w++) {
outptr[0] = inptr[0];
inptr++;
outptr += upscale_factor;
}
outptr += (upscale_factor - 1) * wout;
}
}
}
}
}
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle

lite/backends/arm/math/pixel_shuffle.h
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
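
// fp32 pixel_shuffle kernels operating on contiguous NCHW buffers:
// NEON-specialized variants for upscale_factor 2/3/4 plus a generic
// scalar fallback (pixel_shuffle_native_fp32) used for other factors.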
void pixel_shuffle_scale2_fp32(const float* input,
float* output,
const int num,
const int hin,
const int win,
const int chout,
const int hout,
const int wout);
void pixel_shuffle_scale3_fp32(const float* input,
float* output,
const int num,
const int hin,
const int win,
const int chout,
const int hout,
const int wout);
void pixel_shuffle_scale4_fp32(const float* input,
float* output,
const int num,
const int hin,
const int win,
const int chout,
const int hout,
const int wout);
void pixel_shuffle_native_fp32(const float* input,
float* output,
const int num,
const int hin,
const int win,
const int chout,
const int hout,
const int wout,
const int upscale_factor);
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
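
A quick way to sanity-check the specialized NEON paths is to compare them against pixel_shuffle_native_fp32 on random data. The sketch below is illustrative only (not part of this commit); it assumes it is compiled inside the Paddle-Lite source tree on an ARM target and linked against the arm math sources so that the include and the NEON intrinsics resolve.

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <vector>
#include "lite/backends/arm/math/pixel_shuffle.h"

int main() {
  // Small shape that also exercises the scalar tail (win % 4 != 0).
  const int num = 2, chout = 3, hin = 5, win = 7, r = 2;
  const int hout = hin * r, wout = win * r;
  const int in_size = num * chout * r * r * hin * win;
  std::vector<float> x(in_size);
  std::vector<float> y_neon(num * chout * hout * wout);
  std::vector<float> y_ref(num * chout * hout * wout);
  for (int i = 0; i < in_size; ++i) x[i] = std::rand() / static_cast<float>(RAND_MAX);

  // NEON-specialized path vs. generic scalar reference.
  paddle::lite::arm::math::pixel_shuffle_scale2_fp32(
      x.data(), y_neon.data(), num, hin, win, chout, hout, wout);
  paddle::lite::arm::math::pixel_shuffle_native_fp32(
      x.data(), y_ref.data(), num, hin, win, chout, hout, wout, r);

  for (std::size_t i = 0; i < y_ref.size(); ++i) {
    if (std::fabs(y_neon[i] - y_ref[i]) > 1e-6f) {
      std::printf("mismatch at %zu\n", i);
      return 1;
    }
  }
  std::printf("scale2 kernel matches the native reference\n");
  return 0;
}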

lite/kernels/arm/pixel_shuffle_compute.cc
@@ -13,9 +13,7 @@
 // limitations under the License.
 #include "lite/kernels/arm/pixel_shuffle_compute.h"
-#include <string>
-#include <vector>
-#include "lite/backends/arm/math/funcs.h"
+#include "lite/backends/arm/math/pixel_shuffle.h"
 #include "lite/core/op_registry.h"
 #include "lite/core/tensor.h"
 #include "lite/core/type_system.h"
@@ -30,33 +28,52 @@ void PixelShuffleCompute::Run() {
   const float* x_data = param.x->data<float>();
   float* output_data = param.output->mutable_data<float>();
-  int upscale_factor = param.upscale_factor;
-  int batch_size = param.x->dims()[0];
-  int height = param.x->dims()[2];
-  int width = param.x->dims()[3];
-  int out_channels = param.output->dims()[1];
-  int out_height = param.output->dims()[2];
-  int out_width = param.output->dims()[3];
-#pragma omp parallel for
-  for (int nc = 0; nc < batch_size * out_channels; nc++) {
-    const float* inptr = x_data + nc * out_height * out_width;
-    float* outptr_nc = output_data + nc * out_height * out_width;
-    for (int sh = 0; sh < upscale_factor; sh++) {
-      for (int sw = 0; sw < upscale_factor; sw++) {
-        float* outptr = outptr_nc + sh * out_width + sw;
-        for (int h = 0; h < height; h++) {
-          for (int w = 0; w < width; w++) {
-            outptr[0] = inptr[0];
-            inptr++;
-            outptr += upscale_factor;
-          }
-          outptr += (upscale_factor - 1) * out_width;
-        }
-      }
-    }
-  }
+  const int upscale_factor = param.upscale_factor;
+  const int batch_size = param.x->dims()[0];
+  const int height = param.x->dims()[2];
+  const int width = param.x->dims()[3];
+  const int out_channels = param.output->dims()[1];
+  const int out_height = param.output->dims()[2];
+  const int out_width = param.output->dims()[3];
+  if (upscale_factor == 2) {
+    lite::arm::math::pixel_shuffle_scale2_fp32(x_data,
+                                               output_data,
+                                               batch_size,
+                                               height,
+                                               width,
+                                               out_channels,
+                                               out_height,
+                                               out_width);
+  } else if (upscale_factor == 3) {
+    lite::arm::math::pixel_shuffle_scale3_fp32(x_data,
+                                               output_data,
+                                               batch_size,
+                                               height,
+                                               width,
+                                               out_channels,
+                                               out_height,
+                                               out_width);
+  } else if (upscale_factor == 4) {
+    lite::arm::math::pixel_shuffle_scale4_fp32(x_data,
+                                               output_data,
+                                               batch_size,
+                                               height,
+                                               width,
+                                               out_channels,
+                                               out_height,
+                                               out_width);
+  } else {
+    lite::arm::math::pixel_shuffle_native_fp32(x_data,
+                                               output_data,
+                                               batch_size,
+                                               height,
+                                               width,
+                                               out_channels,
+                                               out_height,
+                                               out_width,
+                                               upscale_factor);
+  }
 #ifdef LITE_WITH_PROFILE
...