[ARM] Update op pixel_shuffle for arm kernel. test=develop (#4145)

dfdfa644 · zhaoyang-star · GitHub · 8de02e61 · dfdfa644 · dfdfa644
4 changed files
--- a/lite/backends/arm/math/CMakeLists.txt
+++ b/lite/backends/arm/math/CMakeLists.txt
@@ -129,5 +129,6 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
      reduce_prod.cc
      lstm.cc
      clip.cc
+      pixel_shuffle.cc
      DEPS ${lite_kernel_deps} context tensor)
 endif()
--- a/lite/backends/arm/math/pixel_shuffle.cc
+++ b/lite/backends/arm/math/pixel_shuffle.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "lite/backends/arm/math/pixel_shuffle.h"
+#include <arm_neon.h>
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+void pixel_shuffle_scale2_fp32(const float* input,  // pixel shuffle with the factor fixed at 2; source floats
+                               float* output,  // destination floats, written front to back per nc slice
+                               const int num,  // multiplied with chout to get the outer trip count
+                               const int hin,  // rows in one input plane
+                               const int win,  // columns in one input plane
+                               const int chout,  // see num
+                               const int hout,  // only used through hout * wout
+                               const int wout) {  // NOTE(review): assumes hout == 2 * hin and wout == 2 * win - confirm
+  const int upscale_factor = 2;  // this variant is hard-wired to 2
+  const int feat_size_in = win * hin;  // floats in one input plane
+  const int feat_size_out = wout * hout;  // floats in one output plane
+  const int cnt = win >> 2;  // full groups of 4 input columns per row
+  const int remain = win - (cnt << 2);  // leftover columns (win % 4) for the scalar tail
+#pragma omp parallel for
+  // batch * out_channel loop; every pointer below is derived from nc alone
+  for (int nc = 0; nc < num * chout; nc++) {
+    const float* inptr = input + nc * feat_size_out;  // stepped by feat_size_out; presumably 4 * feat_size_in equals it - confirm
+    float* outptr = output + nc * feat_size_out;  // advanced continuously by the stores below
+    // out_height loop: each (h, sh) pair emits 2 * win consecutive output floats
+    for (int h = 0; h < hin; h++) {
+      for (int sh = 0; sh < upscale_factor; sh++) {
+        const float* inptr_loc0 =
+            inptr + h * win + sh * feat_size_in * upscale_factor;  // row h of input plane 2 * sh
+        const float* inptr_loc1 = inptr_loc0 + feat_size_in;  // row h of input plane 2 * sh + 1
+        // out_width loop: 4 input columns in, 8 output floats out per step
+        for (int i = 0; i < cnt; i++) {
+          float32x4_t vin0 = vld1q_f32(inptr_loc0);
+          float32x4_t vin1 = vld1q_f32(inptr_loc1);
+          float32x4x2_t vin = {vin0, vin1};
+          vst2q_f32(outptr, vin);  // interleaving store: a0 b0 a1 b1 ..., same layout as the scalar tail
+          outptr += 8;
+          inptr_loc0 += 4;
+          inptr_loc1 += 4;
+        }
+        for (int j = 0; j < remain; j++) {  // scalar tail, one input column per step
+          outptr[0] = inptr_loc0[0];
+          outptr[1] = inptr_loc1[0];
+          inptr_loc0++;
+          inptr_loc1++;
+          outptr += upscale_factor;
+        }
+      }
+    }
+  }
+}
+void pixel_shuffle_scale3_fp32(const float* input,  // pixel shuffle with the factor fixed at 3; source floats
+                               float* output,  // destination floats, written front to back per nc slice
+                               const int num,  // multiplied with chout to get the outer trip count
+                               const int hin,  // rows in one input plane
+                               const int win,  // columns in one input plane
+                               const int chout,  // see num
+                               const int hout,  // only used through hout * wout
+                               const int wout) {  // NOTE(review): assumes hout == 3 * hin and wout == 3 * win - confirm
+  const int upscale_factor = 3;  // this variant is hard-wired to 3
+  const int feat_size_in = win * hin;  // floats in one input plane
+  const int feat_size_out = wout * hout;  // floats in one output plane
+  const int cnt = win >> 2;  // full groups of 4 input columns per row
+  const int remain = win - (cnt << 2);  // leftover columns (win % 4) for the scalar tail
+#pragma omp parallel for
+  // batch * out_channel loop; every pointer below is derived from nc alone
+  for (int nc = 0; nc < num * chout; nc++) {
+    const float* inptr = input + nc * feat_size_out;  // stepped by feat_size_out; presumably 9 * feat_size_in equals it - confirm
+    float* outptr = output + nc * feat_size_out;  // advanced continuously by the stores below
+    // out_height loop: each (h, sh) pair emits 3 * win consecutive output floats
+    for (int h = 0; h < hin; h++) {
+      for (int sh = 0; sh < upscale_factor; sh++) {
+        const float* inptr_loc0 =
+            inptr + h * win + sh * feat_size_in * upscale_factor;  // row h of input plane 3 * sh
+        const float* inptr_loc1 = inptr_loc0 + feat_size_in;  // row h of input plane 3 * sh + 1
+        const float* inptr_loc2 = inptr_loc1 + feat_size_in;  // row h of input plane 3 * sh + 2
+        // out_width loop: 4 input columns in, 12 output floats out per step
+        for (int i = 0; i < cnt; i++) {
+          float32x4_t vin0 = vld1q_f32(inptr_loc0);
+          float32x4_t vin1 = vld1q_f32(inptr_loc1);
+          float32x4_t vin2 = vld1q_f32(inptr_loc2);
+          float32x4x3_t vin = {vin0, vin1, vin2};
+          vst3q_f32(outptr, vin);  // interleaving store: a0 b0 c0 a1 b1 c1 ..., same layout as the scalar tail
+          outptr += 12;
+          inptr_loc0 += 4;
+          inptr_loc1 += 4;
+          inptr_loc2 += 4;
+        }
+        for (int j = 0; j < remain; j++) {  // scalar tail, one input column per step
+          outptr[0] = inptr_loc0[0];
+          outptr[1] = inptr_loc1[0];
+          outptr[2] = inptr_loc2[0];
+          inptr_loc0++;
+          inptr_loc1++;
+          inptr_loc2++;
+          outptr += upscale_factor;
+        }
+      }
+    }
+  }
+}
+void pixel_shuffle_scale4_fp32(const float* input,  // pixel shuffle with the factor fixed at 4; source floats
+                               float* output,  // destination floats, written front to back per nc slice
+                               const int num,  // multiplied with chout to get the outer trip count
+                               const int hin,  // rows in one input plane
+                               const int win,  // columns in one input plane
+                               const int chout,  // see num
+                               const int hout,  // only used through hout * wout
+                               const int wout) {  // NOTE(review): assumes hout == 4 * hin and wout == 4 * win - confirm
+  const int upscale_factor = 4;  // this variant is hard-wired to 4
+  const int feat_size_in = win * hin;  // floats in one input plane
+  const int feat_size_out = wout * hout;  // floats in one output plane
+  const int cnt = win >> 2;  // full groups of 4 input columns per row
+  const int remain = win - (cnt << 2);  // leftover columns (win % 4) for the scalar tail
+#pragma omp parallel for
+  // batch * out_channel loop; every pointer below is derived from nc alone
+  for (int nc = 0; nc < num * chout; nc++) {
+    const float* inptr = input + nc * feat_size_out;  // stepped by feat_size_out; presumably 16 * feat_size_in equals it - confirm
+    float* outptr = output + nc * feat_size_out;  // advanced continuously by the stores below
+    // out_height loop: each (h, sh) pair emits 4 * win consecutive output floats
+    for (int h = 0; h < hin; h++) {
+      for (int sh = 0; sh < upscale_factor; sh++) {
+        const float* inptr_loc0 =
+            inptr + h * win + sh * feat_size_in * upscale_factor;  // row h of input plane 4 * sh
+        const float* inptr_loc1 = inptr_loc0 + feat_size_in;  // row h of input plane 4 * sh + 1
+        const float* inptr_loc2 = inptr_loc1 + feat_size_in;  // row h of input plane 4 * sh + 2
+        const float* inptr_loc3 = inptr_loc2 + feat_size_in;  // row h of input plane 4 * sh + 3
+        // out_width loop: 4 input columns in, 16 output floats out per step
+        for (int i = 0; i < cnt; i++) {
+          float32x4_t vin0 = vld1q_f32(inptr_loc0);
+          float32x4_t vin1 = vld1q_f32(inptr_loc1);
+          float32x4_t vin2 = vld1q_f32(inptr_loc2);
+          float32x4_t vin3 = vld1q_f32(inptr_loc3);
+          float32x4x4_t vin = {vin0, vin1, vin2, vin3};
+          vst4q_f32(outptr, vin);  // interleaving store: a0 b0 c0 d0 a1 b1 c1 d1 ..., same layout as the scalar tail
+          outptr += 16;
+          inptr_loc0 += 4;
+          inptr_loc1 += 4;
+          inptr_loc2 += 4;
+          inptr_loc3 += 4;
+        }
+        for (int j = 0; j < remain; j++) {  // scalar tail, one input column per step
+          outptr[0] = inptr_loc0[0];
+          outptr[1] = inptr_loc1[0];
+          outptr[2] = inptr_loc2[0];
+          outptr[3] = inptr_loc3[0];
+          inptr_loc0++;
+          inptr_loc1++;
+          inptr_loc2++;
+          inptr_loc3++;
+          outptr += upscale_factor;
+        }
+      }
+    }
+  }
+}
+void pixel_shuffle_native_fp32(const float* input,  // scalar pixel shuffle for any factor; source floats
+                               float* output,  // destination floats, written with a stride (see below)
+                               const int num,  // multiplied with chout to get the outer trip count
+                               const int hin,  // rows in one input plane
+                               const int win,  // columns in one input plane
+                               const int chout,  // see num
+                               const int hout,  // only used through hout * wout
+                               const int wout,  // output row pitch used for the sh offset and the row skip
+                               const int upscale_factor) {  // runtime factor; no NEON intrinsics are used here
+#pragma omp parallel for
+  for (int nc = 0; nc < num * chout; nc++) {  // each iteration derives its pointers from nc alone
+    const float* inptr = input + nc * hout * wout;  // input and output slices share the hout * wout stride
+    float* outptr_nc = output + nc * hout * wout;
+    for (int sh = 0; sh < upscale_factor; sh++) {  // row offset of the current input plane in the output
+      for (int sw = 0; sw < upscale_factor; sw++) {  // column offset of the current input plane
+        float* outptr = outptr_nc + sh * wout + sw;  // first output element fed by plane (sh, sw)
+        for (int h = 0; h < hin; h++) {
+          for (int w = 0; w < win; w++) {
+            outptr[0] = inptr[0];
+            inptr++;  // input is consumed contiguously, hin * win floats per (sh, sw)
+            outptr += upscale_factor;  // output is written every upscale_factor-th float
+          }
+          outptr += (upscale_factor - 1) * wout;  // NOTE(review): lands upscale_factor rows down if wout == win * upscale_factor - confirm
+        }
+      }
+    }
+  }
+}
+}  // namespace math
+}  // namespace arm
+}  // namespace lite
+}  // namespace paddle
--- a/lite/backends/arm/math/pixel_shuffle.h
+++ b/lite/backends/arm/math/pixel_shuffle.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "lite/utils/cp_logging.h"
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+void pixel_shuffle_scale2_fp32(const float* input,  // variant with the upscale factor fixed at 2
+                               float* output,
+                               const int num,  // the ARM kernel passes batch_size here
+                               const int hin,  // the ARM kernel passes the input height here
+                               const int win,  // the ARM kernel passes the input width here
+                               const int chout,  // the ARM kernel passes out_channels here
+                               const int hout,  // the ARM kernel passes out_height here
+                               const int wout);  // the ARM kernel passes out_width here
+void pixel_shuffle_scale3_fp32(const float* input,  // variant with the upscale factor fixed at 3
+                               float* output,
+                               const int num,
+                               const int hin,
+                               const int win,
+                               const int chout,
+                               const int hout,
+                               const int wout);  // same argument order as the scale2 variant
+void pixel_shuffle_scale4_fp32(const float* input,  // variant with the upscale factor fixed at 4
+                               float* output,
+                               const int num,
+                               const int hin,
+                               const int win,
+                               const int chout,
+                               const int hout,
+                               const int wout);  // same argument order as the scale2 variant
+void pixel_shuffle_native_fp32(const float* input,  // fallback the ARM kernel uses for every other factor
+                               float* output,
+                               const int num,
+                               const int hin,
+                               const int win,
+                               const int chout,
+                               const int hout,
+                               const int wout,
+                               const int upscale_factor);  // factor is a runtime argument in this variant
+}  // namespace math
+}  // namespace arm
+}  // namespace lite
+}  // namespace paddle
--- a/lite/kernels/arm/pixel_shuffle_compute.cc
+++ b/lite/kernels/arm/pixel_shuffle_compute.cc
@@ -13,9 +13,7 @@
 // limitations under the License.
 #include "lite/kernels/arm/pixel_shuffle_compute.h"
-#include <string>
+#include "lite/backends/arm/math/pixel_shuffle.h"
-#include <vector>
-#include "lite/backends/arm/math/funcs.h"
 #include "lite/core/op_registry.h"
 #include "lite/core/tensor.h"
 #include "lite/core/type_system.h"
@@ -30,33 +28,52 @@ void PixelShuffleCompute::Run() {
  const float* x_data = param.x->data<float>();
  float* output_data = param.output->mutable_data<float>();
-  int upscale_factor = param.upscale_factor;
+  const int upscale_factor = param.upscale_factor;
-  int batch_size = param.x->dims()[0];
+  const int batch_size = param.x->dims()[0];
-  int height = param.x->dims()[2];
+  const int height = param.x->dims()[2];
-  int width = param.x->dims()[3];
+  const int width = param.x->dims()[3];
-  int out_channels = param.output->dims()[1];
+  const int out_channels = param.output->dims()[1];
-  int out_height = param.output->dims()[2];
+  const int out_height = param.output->dims()[2];
-  int out_width = param.output->dims()[3];
+  const int out_width = param.output->dims()[3];
-#pragma omp parallel for
+  if (upscale_factor == 2) {
-  for (int nc = 0; nc < batch_size * out_channels; nc++) {
+    lite::arm::math::pixel_shuffle_scale2_fp32(x_data,
-    const float* inptr = x_data + nc * out_height * out_width;
+                                               output_data,
-    float* outptr_nc = output_data + nc * out_height * out_width;
+                                               batch_size,
+                                               height,
-    for (int sh = 0; sh < upscale_factor; sh++) {
+                                               width,
-      for (int sw = 0; sw < upscale_factor; sw++) {
+                                               out_channels,
-        float* outptr = outptr_nc + sh * out_width + sw;
+                                               out_height,
-        for (int h = 0; h < height; h++) {
+                                               out_width);
-          for (int w = 0; w < width; w++) {
+  } else if (upscale_factor == 3) {
-            outptr[0] = inptr[0];
+    lite::arm::math::pixel_shuffle_scale3_fp32(x_data,
-            inptr++;
+                                               output_data,
-            outptr += upscale_factor;
+                                               batch_size,
-          }
+                                               height,
-          outptr += (upscale_factor - 1) * out_width;
+                                               width,
-        }
+                                               out_channels,
-      }
+                                               out_height,
-    }
+                                               out_width);
+  } else if (upscale_factor == 4) {
+    lite::arm::math::pixel_shuffle_scale4_fp32(x_data,
+                                               output_data,
+                                               batch_size,
+                                               height,
+                                               width,
+                                               out_channels,
+                                               out_height,
+                                               out_width);
+  } else {
+    lite::arm::math::pixel_shuffle_native_fp32(x_data,
+                                               output_data,
+                                               batch_size,
+                                               height,
+                                               width,
+                                               out_channels,
+                                               out_height,
+                                               out_width,
+                                               upscale_factor);
  }
 #ifdef LITE_WITH_PROFILE