1. fix group logic for convolution op. 2. add pixel shuffle op for OpenCL. (#2178)

77811367 · Yanzhan Yang · GitHub · 7931104f · 77811367 · 77811367
12 changed file
--- a/mobile/src/common/types.cpp
+++ b/mobile/src/common/types.cpp
@@ -133,6 +133,7 @@ const char *G_OP_TYPE_BEAM_SEARCH_DECODE = "beam_search_decode";
 const char *G_OP_TYPE_FILL_CONSTAN_BATCH_SIZE_LIKE =
    "fill_constant_batch_size_like";
 const char *G_OP_TYPE_FUSION_INSTANCENORM_RELU = "fusion_instancenorm_relu";
+const char *G_OP_TYPE_PIXEL_SHUFFLE = "pixel_shuffle";
 std::unordered_map<
    std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
@@ -256,5 +257,6 @@ std::unordered_map<
        {G_OP_TYPE_BEAM_SEARCH_DECODE,
         {{"Ids", "Scores"}, {"SentenceIds", "SentenceScores"}}},
        {G_OP_TYPE_FILL_CONSTAN_BATCH_SIZE_LIKE, {{"Input"}, {"Out"}}},
-        {G_OP_TYPE_PAD2D, {{"X"}, {"Out"}}}};
+        {G_OP_TYPE_PAD2D, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_PIXEL_SHUFFLE, {{"X"}, {"Out"}}}};
 }  // namespace paddle_mobile
--- a/mobile/src/common/types.h
+++ b/mobile/src/common/types.h
@@ -264,6 +264,7 @@ extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU;
 extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN;
 extern const char *G_OP_TYPE_FUSION_DECONV_BN_RELU;
 extern const char *G_OP_TYPE_FUSION_INSTANCENORM_RELU;
+extern const char *G_OP_TYPE_PIXEL_SHUFFLE;
 extern std::unordered_map<
    std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>

--- a/mobile/src/framework/load_ops.h
+++ b/mobile/src/framework/load_ops.h
@@ -377,3 +377,6 @@ LOAD_OP1(range, CPU);
 #ifdef REDUCE_PROD_OP
 LOAD_OP1(reduce_prod, CPU);
 #endif
+#ifdef PIXEL_SHUFFLE_OP
+LOAD_OP1(pixel_shuffle, GPU_CL);
+#endif
--- a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
+++ b/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
@@ -59,6 +59,7 @@ void ConvAddBnReluPt1x2(framework::CLHelper *cl_helper,
  int input_height = param.Input()->dims()[2];
  int output_width = param.Output()->dims()[3];
  int output_height = param.Output()->dims()[2];
+  int output_c = param.Output()->dims()[1];
  int filter_channel = param.Filter()->dims()[1];
  int input_channel = param.Input()->dims()[1];
  //
@@ -216,6 +217,7 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
  int input_height = param.Input()->dims()[2];
  int output_width = param.Output()->dims()[3];
  int output_height = param.Output()->dims()[2];
+  int output_c = param.Output()->dims()[1];
  int filter_channel = param.Filter()->dims()[1];
  int input_channel = param.Input()->dims()[1];
@@ -397,21 +399,21 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
    status = clSetKernelArg(kernel, index++, sizeof(int), &output_height);
    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel, index++, sizeof(int), &output_c);
+    CL_CHECK_ERRORS(status);
    if (param.Filter()->dims()[2] == 3 && param.Filter()->dims()[3] == 3) {
      if (filter_channel != input_channel) {
-        if (filter_channel != 1) {
+        status = clSetKernelArg(kernel, index++, sizeof(int), &filter_channel);
-          status =
-              clSetKernelArg(kernel, index++, sizeof(int), &filter_channel);
        CL_CHECK_ERRORS(status);
-          int has_group = 1;
+        int group = input_channel / filter_channel;
-          status = clSetKernelArg(kernel, index++, sizeof(int), &has_group);
+        status = clSetKernelArg(kernel, index++, sizeof(int), &group);
        CL_CHECK_ERRORS(status);
-        }
      } else {
        status = clSetKernelArg(kernel, index++, sizeof(int), &filter_channel);
        CL_CHECK_ERRORS(status);
-        int has_group = 0;
+        int group = 1;
-        status = clSetKernelArg(kernel, index++, sizeof(int), &has_group);
+        status = clSetKernelArg(kernel, index++, sizeof(int), &group);
        CL_CHECK_ERRORS(status);
      }
    }

--- a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+++ b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
@@ -48,8 +48,9 @@ __kernel void conv_3x3(__private const int global_size_dim0,
                                              __private const int input_height,/* of one block */
                                              __private const int output_width,
                                              __private const int output_height,
+                                              __private const int output_c,
                                              __private const int filter_channel,
-                                              __private const int has_group) {
+                                              __private const int group) {
    const int out_c = get_global_id(0);
    const int out_w = get_global_id(1);
@@ -90,7 +91,7 @@ __kernel void conv_3x3(__private const int global_size_dim0,
 #endif
    half4 input[9];
-    if (has_group == 0) {
+    if (group == 1) {
        for (int i = 0; i < input_c; ++i) {
            int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
            input[0] = select(read_imageh(input_image, sampler,
@@ -326,7 +327,7 @@ __kernel void conv_3x3(__private const int global_size_dim0,
        }
    } else {
        for (int i = 0; i < 4; i++) {
-            int used_input_channel_num = (out_c * 4 + i) * filter_channel;
+            int used_input_channel_num = (out_c * 4 + i) / (output_c / group) * filter_channel;
            for (int f_c = 0; f_c < filter_channel; ++f_c) {
                int input_c = used_input_channel_num + f_c;
                int input_block = input_c / 4;

--- a/mobile/src/operators/kernel/cl/cl_kernel/pixel_shuffle_kernel.cl
+++ b/mobile/src/operators/kernel/cl/cl_kernel/pixel_shuffle_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+__kernel void pixel_shuffle(__read_only image2d_t input_image,
+                        __write_only image2d_t output_image,
+                        __private const int in_N,
+                        __private const int in_C,
+                        __private const int in_H,
+                        __private const int in_W,
+                        __private const int out_N,
+                        __private const int out_C,
+                        __private const int out_H,
+                        __private const int out_W,
+                        __private const int upscale_factor) {
+                        const int out_c4 = get_global_id(0);
+                        const int out_w = get_global_id(1);
+                        const int out_nh = get_global_id(2);
+                        int out_h = out_nh % out_H;
+                        int out_n = out_nh / out_H;
+                        const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                                                    CLK_ADDRESS_CLAMP |
+                                                    CLK_FILTER_NEAREST;
+                        int in_h = out_h / upscale_factor;
+                        int in_w = out_w / upscale_factor;
+                        int in_nh = out_n * in_H + in_h;
+                        half4 res;
+                        int out_c;
+                        int in_c;
+                        half4 in;
+                        int2 in_pos;
+                        out_c = out_c4 * 4 + 0;
+                        in_c = out_c * upscale_factor * upscale_factor + (out_h % upscale_factor) * upscale_factor + (out_w % upscale_factor);
+                        in_pos.x = (in_c / 4) * in_W + in_w;
+                        in_pos.y = in_nh;
+                        in = read_imageh(input_image, sampler, in_pos);
+                        if (in_c % 4 == 0) {
+                            res.x = in.x;
+                        } else if (in_c % 4 == 1) {
+                            res.x = in.y;
+                        } else if (in_c % 4 == 2) {
+                            res.x = in.z;
+                        } else if (in_c % 4 == 3) {
+                            res.x = in.w;
+                        }
+                        out_c = out_c4 * 4 + 1;
+                        in_c = out_c * upscale_factor * upscale_factor + (out_h % upscale_factor) * upscale_factor + (out_w % upscale_factor);
+                        in_pos.x = (in_c / 4) * in_W + in_w;
+                        in_pos.y = in_nh;
+                        in = read_imageh(input_image, sampler, in_pos);
+                        if (in_c % 4 == 0) {
+                            res.y = in.x;
+                        } else if (in_c % 4 == 1) {
+                            res.y = in.y;
+                        } else if (in_c % 4 == 2) {
+                            res.y = in.z;
+                        } else if (in_c % 4 == 3) {
+                            res.y = in.w;
+                        }
+                        out_c = out_c4 * 4 + 2;
+                        in_c = out_c * upscale_factor * upscale_factor + (out_h % upscale_factor) * upscale_factor + (out_w % upscale_factor);
+                        in_pos.x = (in_c / 4) * in_W + in_w;
+                        in_pos.y = in_nh;
+                        in = read_imageh(input_image, sampler, in_pos);
+                        if (in_c % 4 == 0) {
+                            res.z = in.x;
+                        } else if (in_c % 4 == 1) {
+                            res.z = in.y;
+                        } else if (in_c % 4 == 2) {
+                            res.z = in.z;
+                        } else if (in_c % 4 == 3) {
+                            res.z = in.w;
+                        }
+                        out_c = out_c4 * 4 + 3;
+                        in_c = out_c * upscale_factor * upscale_factor + (out_h % upscale_factor) * upscale_factor + (out_w % upscale_factor);
+                        in_pos.x = (in_c / 4) * in_W + in_w;
+                        in_pos.y = in_nh;
+                        in = read_imageh(input_image, sampler, in_pos);
+                        if (in_c % 4 == 0) {
+                            res.w = in.x;
+                        } else if (in_c % 4 == 1) {
+                            res.w = in.y;
+                        } else if (in_c % 4 == 2) {
+                            res.w = in.z;
+                        } else if (in_c % 4 == 3) {
+                            res.w = in.w;
+                        }
+                        int2 out_pos;
+                        out_pos.x = out_c4 * out_W + out_w;
+                        out_pos.y = out_nh;
+                        write_imageh(output_image, out_pos, res);
+}
--- a/mobile/src/operators/kernel/cl/pixel_shuffle_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/pixel_shuffle_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef PIXEL_SHUFFLE_OP
+#include "operators/kernel/pixel_shuffle_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool PixelShuffleKernel<GPU_CL, float>::Init(PixelShuffleParam<GPU_CL> *param) {
+  this->cl_helper_.AddKernel("pixel_shuffle", "pixel_shuffle_kernel.cl");
+  return true;
+}
+template <>
+void PixelShuffleKernel<GPU_CL, float>::Compute(
+    const PixelShuffleParam<GPU_CL> &param) {
+  auto kernel = this->cl_helper_.KernelAt(0);
+  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out());
+  auto input_image = param.InputX()->GetCLImage();
+  auto output_image = param.Out()->GetCLImage();
+  auto upscale_factor = param.upscale_factor();
+  int input_n = param.InputX()->dims()[0];
+  int input_c = param.InputX()->dims()[1];
+  int input_h = param.InputX()->dims()[2];
+  int input_w = param.InputX()->dims()[3];
+  int output_n = param.Out()->dims()[0];
+  int output_c = param.Out()->dims()[1];
+  int output_h = param.Out()->dims()[2];
+  int output_w = param.Out()->dims()[3];
+  cl_int status;
+  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 2, sizeof(int), &input_n);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 3, sizeof(int), &input_c);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 4, sizeof(int), &input_h);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 5, sizeof(int), &input_w);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 6, sizeof(int), &output_n);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 7, sizeof(int), &output_c);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 8, sizeof(int), &output_h);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 9, sizeof(int), &output_w);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 10, sizeof(int), &upscale_factor);
+  CL_CHECK_ERRORS(status);
+  status = clEnqueueNDRangeKernel(
+      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
+      default_work_size.data(), NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/mobile/src/operators/kernel/pixel_shuffle_kernel.h
+++ b/mobile/src/operators/kernel/pixel_shuffle_kernel.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#ifdef LRN_OP
+#include <cmath>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#include "operators/math/math.h"
+#endif
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename DeviceType, typename T>
+class PixelShuffleKernel
+    : public framework::OpKernelBase<DeviceType,
+                                     PixelShuffleParam<DeviceType>> {
+ public:
+  void Compute(const PixelShuffleParam<DeviceType> &param);
+  bool Init(PixelShuffleParam<DeviceType> *param);
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/mobile/src/operators/op_param.h
+++ b/mobile/src/operators/op_param.h
@@ -3628,5 +3628,35 @@ class EXPParam : public OpParam {
  GType *out_;
 };
 #endif
+#ifdef PIXEL_SHUFFLE_OP
+template <typename Dtype>
+class PixelShuffleParam : public OpParam {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+ public:
+  PixelShuffleParam(const VariableNameMap &inputs,
+                    const VariableNameMap &outputs, const AttributeMap &attrs,
+                    Scope *scope)
+      : OpParam(inputs, outputs, attrs, scope) {
+    input_x_ = InputXFrom<GType>(inputs, *scope);
+    out_ = OutFrom<GType>(outputs, *scope);
+    upscale_factor_ = GetAttr<int>("upscale_factor", attrs);
+  }
+  const GType *InputX() const { return input_x_; }
+  GType *Out() const { return out_; }
+  const int &upscale_factor() const { return upscale_factor_; }
+ private:
+  GType *input_x_;
+  GType *out_;
+  int upscale_factor_;
+};
+#endif
 }  // namespace operators
 }  // namespace paddle_mobile
--- a/mobile/src/operators/pixel_shuffle_op.cpp
+++ b/mobile/src/operators/pixel_shuffle_op.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef PIXEL_SHUFFLE_OP
+#include "operators/pixel_shuffle_op.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename Dtype, typename T>
+void PixelShuffleOp<Dtype, T>::InferShape() const {
+  auto x_dims = this->param_.InputX()->dims();
+  int n = x_dims[0];
+  int c = x_dims[1];
+  int h = x_dims[2];
+  int w = x_dims[3];
+  int upscale_factor = this->param_.upscale_factor();
+  this->param_.Out()->Resize(
+      framework::make_ddim({n, c / (upscale_factor * upscale_factor),
+                            h * upscale_factor, w * upscale_factor}));
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CL
+REGISTER_OPERATOR_CL(pixel_shuffle, ops::PixelShuffleOp);
+#endif
+#endif
--- a/mobile/src/operators/pixel_shuffle_op.h
+++ b/mobile/src/operators/pixel_shuffle_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef PIXEL_SHUFFLE_OP
+#pragma once
+#include <string>
+#include "framework/operator.h"
+#include "operators/kernel/pixel_shuffle_kernel.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+using std::string;
+template <typename DeviceType, typename T>
+class PixelShuffleOp : public framework::OperatorWithKernel<
+                           DeviceType, PixelShuffleParam<DeviceType>,
+                           operators::PixelShuffleKernel<DeviceType, T>> {
+ public:
+  PixelShuffleOp(const string &type, const VariableNameMap &inputs,
+                 const VariableNameMap &outputs,
+                 const framework::AttributeMap &attrs, framework::Scope *scope)
+      : framework::OperatorWithKernel<
+            DeviceType, PixelShuffleParam<DeviceType>,
+            operators::PixelShuffleKernel<DeviceType, T>>(type, inputs, outputs,
+                                                          attrs, scope) {}
+  void InferShape() const override;
+ protected:
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/mobile/tools/op.cmake
+++ b/mobile/tools/op.cmake
@@ -378,6 +378,7 @@ if(NOT FOUND_MATCH)
  set(RANGE_OP ON)
  set(REDUCE_PROD_OP ON)
  set(FUSION_INSTANCENORM_RELU_OP ON)
+  set(PIXEL_SHUFFLE_OP ON)
 endif()
  # option(BATCHNORM_OP "" ON)
@@ -751,3 +752,6 @@ endif()
 if (REDUCE_PROD_OP)
  add_definitions(-DREDUCE_PROD_OP)
 endif()
+if (PIXEL_SHUFFLE_OP)
+  add_definitions(-DPIXEL_SHUFFLE_OP)
+endif()