diff --git a/lite/backends/arm/math/CMakeLists.txt b/lite/backends/arm/math/CMakeLists.txt
index 1c0e8e5bf9ad350e8948e06808c9510e476139bd..244467d62492bc3017ebdb6144b49ccb9fcd30c1 100644
--- a/lite/backends/arm/math/CMakeLists.txt
+++ b/lite/backends/arm/math/CMakeLists.txt
@@ -129,5 +129,6 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
       reduce_prod.cc
       lstm.cc
       clip.cc
+      pixel_shuffle.cc
       DEPS ${lite_kernel_deps} context tensor)
 endif()
diff --git a/lite/backends/arm/math/pixel_shuffle.cc b/lite/backends/arm/math/pixel_shuffle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..709b473de7b78ec82f9719b5a5309ffa851e0466
--- /dev/null
+++ b/lite/backends/arm/math/pixel_shuffle.cc
@@ -0,0 +1,233 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/backends/arm/math/pixel_shuffle.h"
+#include <arm_neon.h>
+
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+
+void pixel_shuffle_scale2_fp32(const float* input,
+                               float* output,
+                               const int num,
+                               const int hin,
+                               const int win,
+                               const int chout,
+                               const int hout,
+                               const int wout) {
+  const int upscale_factor = 2;
+  const int feat_size_in = win * hin;
+  const int feat_size_out = wout * hout;
+
+  const int cnt = win >> 2;
+  const int remain = win - (cnt << 2);
+
+#pragma omp parallel for
+  // batch * out_channel loop
+  for (int nc = 0; nc < num * chout; nc++) {
+    const float* inptr = input + nc * feat_size_out;
+    float* outptr = output + nc * feat_size_out;
+
+    // out_height loop
+    for (int h = 0; h < hin; h++) {
+      for (int sh = 0; sh < upscale_factor; sh++) {
+        const float* inptr_loc0 =
+            inptr + h * win + sh * feat_size_in * upscale_factor;
+        const float* inptr_loc1 = inptr_loc0 + feat_size_in;
+
+        // out_width loop
+        for (int i = 0; i < cnt; i++) {
+          float32x4_t vin0 = vld1q_f32(inptr_loc0);
+          float32x4_t vin1 = vld1q_f32(inptr_loc1);
+
+          float32x4x2_t vin = {vin0, vin1};
+
+          vst2q_f32(outptr, vin);
+          outptr += 8;
+
+          inptr_loc0 += 4;
+          inptr_loc1 += 4;
+        }
+
+        for (int j = 0; j < remain; j++) {
+          outptr[0] = inptr_loc0[0];
+          outptr[1] = inptr_loc1[0];
+          inptr_loc0++;
+          inptr_loc1++;
+          outptr += upscale_factor;
+        }
+      }
+    }
+  }
+}
+
+void pixel_shuffle_scale3_fp32(const float* input,
+                               float* output,
+                               const int num,
+                               const int hin,
+                               const int win,
+                               const int chout,
+                               const int hout,
+                               const int wout) {
+  const int upscale_factor = 3;
+  const int feat_size_in = win * hin;
+  const int feat_size_out = wout * hout;
+
+  const int cnt = win >> 2;
+  const int remain = win - (cnt << 2);
+
+#pragma omp parallel for
+  // batch * out_channel loop
+  for (int nc = 0; nc < num * chout; nc++) {
+    const float* inptr = input + nc * feat_size_out;
+    float* outptr = output + nc * feat_size_out;
+
+    // out_height loop
+    for (int h = 0; h < hin; h++) {
+      for (int sh = 0; sh < upscale_factor; sh++) {
+        const float* inptr_loc0 =
+            inptr + h * win + sh * feat_size_in * upscale_factor;
+        const float* inptr_loc1 = inptr_loc0 + feat_size_in;
+        const float* inptr_loc2 = inptr_loc1 + feat_size_in;
+
+        // out_width loop
+        for (int i = 0; i < cnt; i++) {
+          float32x4_t vin0 = vld1q_f32(inptr_loc0);
+          float32x4_t vin1 = vld1q_f32(inptr_loc1);
+          float32x4_t vin2 = vld1q_f32(inptr_loc2);
+
+          float32x4x3_t vin = {vin0, vin1, vin2};
+
+          vst3q_f32(outptr, vin);
+          outptr += 12;
+
+          inptr_loc0 += 4;
+          inptr_loc1 += 4;
+          inptr_loc2 += 4;
+        }
+
+        for (int j = 0; j < remain; j++) {
+          outptr[0] = inptr_loc0[0];
+          outptr[1] = inptr_loc1[0];
+          outptr[2] = inptr_loc2[0];
+          inptr_loc0++;
+          inptr_loc1++;
+          inptr_loc2++;
+          outptr += upscale_factor;
+        }
+      }
+    }
+  }
+}
+
+void pixel_shuffle_scale4_fp32(const float* input,
+                               float* output,
+                               const int num,
+                               const int hin,
+                               const int win,
+                               const int chout,
+                               const int hout,
+                               const int wout) {
+  const int upscale_factor = 4;
+  const int feat_size_in = win * hin;
+  const int feat_size_out = wout * hout;
+
+  const int cnt = win >> 2;
+  const int remain = win - (cnt << 2);
+
+#pragma omp parallel for
+  // batch * out_channel loop
+  for (int nc = 0; nc < num * chout; nc++) {
+    const float* inptr = input + nc * feat_size_out;
+    float* outptr = output + nc * feat_size_out;
+
+    // out_height loop
+    for (int h = 0; h < hin; h++) {
+      for (int sh = 0; sh < upscale_factor; sh++) {
+        const float* inptr_loc0 =
+            inptr + h * win + sh * feat_size_in * upscale_factor;
+        const float* inptr_loc1 = inptr_loc0 + feat_size_in;
+        const float* inptr_loc2 = inptr_loc1 + feat_size_in;
+        const float* inptr_loc3 = inptr_loc2 + feat_size_in;
+
+        // out_width loop
+        for (int i = 0; i < cnt; i++) {
+          float32x4_t vin0 = vld1q_f32(inptr_loc0);
+          float32x4_t vin1 = vld1q_f32(inptr_loc1);
+          float32x4_t vin2 = vld1q_f32(inptr_loc2);
+          float32x4_t vin3 = vld1q_f32(inptr_loc3);
+
+          float32x4x4_t vin = {vin0, vin1, vin2, vin3};
+
+          vst4q_f32(outptr, vin);
+          outptr += 16;
+
+          inptr_loc0 += 4;
+          inptr_loc1 += 4;
+          inptr_loc2 += 4;
+          inptr_loc3 += 4;
+        }
+
+        for (int j = 0; j < remain; j++) {
+          outptr[0] = inptr_loc0[0];
+          outptr[1] = inptr_loc1[0];
+          outptr[2] = inptr_loc2[0];
+          outptr[3] = inptr_loc3[0];
+          inptr_loc0++;
+          inptr_loc1++;
+          inptr_loc2++;
+          inptr_loc3++;
+          outptr += upscale_factor;
+        }
+      }
+    }
+  }
+}
+
+void pixel_shuffle_native_fp32(const float* input,
+                               float* output,
+                               const int num,
+                               const int hin,
+                               const int win,
+                               const int chout,
+                               const int hout,
+                               const int wout,
+                               const int upscale_factor) {
+#pragma omp parallel for
+  for (int nc = 0; nc < num * chout; nc++) {
+    const float* inptr = input + nc * hout * wout;
+    float* outptr_nc = output + nc * hout * wout;
+
+    for (int sh = 0; sh < upscale_factor; sh++) {
+      for (int sw = 0; sw < upscale_factor; sw++) {
+        float* outptr = outptr_nc + sh * wout + sw;
+        for (int h = 0; h < hin; h++) {
+          for (int w = 0; w < win; w++) {
+            outptr[0] = inptr[0];
+            inptr++;
+            outptr += upscale_factor;
+          }
+          outptr += (upscale_factor - 1) * wout;
+        }
+      }
+    }
+  }
+}
+
+}  // namespace math
+}  // namespace arm
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/arm/math/pixel_shuffle.h b/lite/backends/arm/math/pixel_shuffle.h
new file mode 100644
index 0000000000000000000000000000000000000000..c7d57cc0b568f0e2f868790f3329e4f24905393c
--- /dev/null
+++ b/lite/backends/arm/math/pixel_shuffle.h
@@ -0,0 +1,61 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "lite/utils/cp_logging.h"
+
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+
+void pixel_shuffle_scale2_fp32(const float* input,
+                               float* output,
+                               const int num,
+                               const int hin,
+                               const int win,
+                               const int chout,
+                               const int hout,
+                               const int wout);
+void pixel_shuffle_scale3_fp32(const float* input,
+                               float* output,
+                               const int num,
+                               const int hin,
+                               const int win,
+                               const int chout,
+                               const int hout,
+                               const int wout);
+void pixel_shuffle_scale4_fp32(const float* input,
+                               float* output,
+                               const int num,
+                               const int hin,
+                               const int win,
+                               const int chout,
+                               const int hout,
+                               const int wout);
+void pixel_shuffle_native_fp32(const float* input,
+                               float* output,
+                               const int num,
+                               const int hin,
+                               const int win,
+                               const int chout,
+                               const int hout,
+                               const int wout,
+                               const int upscale_factor);
+
+}  // namespace math
+}  // namespace arm
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/arm/pixel_shuffle_compute.cc b/lite/kernels/arm/pixel_shuffle_compute.cc
index 6a16dcec3ef8a46f6c17689cf2bbc5290a09724c..ad51d53f8b8ef61a8b9d984a2af3efbe40589c7b 100644
--- a/lite/kernels/arm/pixel_shuffle_compute.cc
+++ b/lite/kernels/arm/pixel_shuffle_compute.cc
@@ -13,9 +13,7 @@
 // limitations under the License.
 
 #include "lite/kernels/arm/pixel_shuffle_compute.h"
-#include <string>
-#include <vector>
-#include "lite/backends/arm/math/funcs.h"
+#include "lite/backends/arm/math/pixel_shuffle.h"
 #include "lite/core/op_registry.h"
 #include "lite/core/tensor.h"
 #include "lite/core/type_system.h"
@@ -30,33 +28,52 @@ void PixelShuffleCompute::Run() {
 
   const float* x_data = param.x->data<float>();
   float* output_data = param.output->mutable_data<float>();
-  int upscale_factor = param.upscale_factor;
+  const int upscale_factor = param.upscale_factor;
 
-  int batch_size = param.x->dims()[0];
-  int height = param.x->dims()[2];
-  int width = param.x->dims()[3];
-  int out_channels = param.output->dims()[1];
-  int out_height = param.output->dims()[2];
-  int out_width = param.output->dims()[3];
+  const int batch_size = param.x->dims()[0];
+  const int height = param.x->dims()[2];
+  const int width = param.x->dims()[3];
+  const int out_channels = param.output->dims()[1];
+  const int out_height = param.output->dims()[2];
+  const int out_width = param.output->dims()[3];
 
-#pragma omp parallel for
-  for (int nc = 0; nc < batch_size * out_channels; nc++) {
-    const float* inptr = x_data + nc * out_height * out_width;
-    float* outptr_nc = output_data + nc * out_height * out_width;
-
-    for (int sh = 0; sh < upscale_factor; sh++) {
-      for (int sw = 0; sw < upscale_factor; sw++) {
-        float* outptr = outptr_nc + sh * out_width + sw;
-        for (int h = 0; h < height; h++) {
-          for (int w = 0; w < width; w++) {
-            outptr[0] = inptr[0];
-            inptr++;
-            outptr += upscale_factor;
-          }
-          outptr += (upscale_factor - 1) * out_width;
-        }
-      }
-    }
+  if (upscale_factor == 2) {
+    lite::arm::math::pixel_shuffle_scale2_fp32(x_data,
+                                               output_data,
+                                               batch_size,
+                                               height,
+                                               width,
+                                               out_channels,
+                                               out_height,
+                                               out_width);
+  } else if (upscale_factor == 3) {
+    lite::arm::math::pixel_shuffle_scale3_fp32(x_data,
+                                               output_data,
+                                               batch_size,
+                                               height,
+                                               width,
+                                               out_channels,
+                                               out_height,
+                                               out_width);
+  } else if (upscale_factor == 4) {
+    lite::arm::math::pixel_shuffle_scale4_fp32(x_data,
+                                               output_data,
+                                               batch_size,
+                                               height,
+                                               width,
+                                               out_channels,
+                                               out_height,
+                                               out_width);
+  } else {
+    lite::arm::math::pixel_shuffle_native_fp32(x_data,
+                                               output_data,
+                                               batch_size,
+                                               height,
+                                               width,
+                                               out_channels,
+                                               out_height,
+                                               out_width,
+                                               upscale_factor);
   }
 
 #ifdef LITE_WITH_PROFILE