[MOBILE][OPENCL]Add elementwise_sub op; Fix instancenorm op; (#2622)

f3d113a1 · Jiaying Zhao · GitHub · d1b7aec5 · f3d113a1 · f3d113a1
15 changed file
--- a/mobile/src/common/types.cpp
+++ b/mobile/src/common/types.cpp
@@ -134,6 +134,8 @@ const char *G_OP_TYPE_FILL_CONSTAN_BATCH_SIZE_LIKE =
    "fill_constant_batch_size_like";
 const char *G_OP_TYPE_FUSION_INSTANCENORM_RELU = "fusion_instancenorm_relu";
 const char *G_OP_TYPE_PIXEL_SHUFFLE = "pixel_shuffle";
+const char *G_OP_TYPE_EXPAND = "expand";
+const char *G_OP_TYPE_GRID_SAMPLER = "grid_sampler";

 std::unordered_map<
    std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
@@ -156,7 +158,7 @@ std::unordered_map<
        {G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}},
        {G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}},
        {G_OP_TYPE_BATCHNORM, {{"X"}, {"Y"}}},
-        {G_OP_TYPE_INSTANCENORM, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_INSTANCENORM, {{"X"}, {"Y"}}},
        {G_OP_TYPE_FUSION_INSTANCENORM_RELU, {{"X"}, {"Out"}}},
        {G_OP_TYPE_LRN, {{"X"}, {"Out"}}},
        {G_OP_TYPE_CONCAT, {{"X"}, {"Out"}}},
@@ -258,5 +260,7 @@ std::unordered_map<
         {{"Ids", "Scores"}, {"SentenceIds", "SentenceScores"}}},
        {G_OP_TYPE_FILL_CONSTAN_BATCH_SIZE_LIKE, {{"Input"}, {"Out"}}},
        {G_OP_TYPE_PAD2D, {{"X"}, {"Out"}}},
-        {G_OP_TYPE_PIXEL_SHUFFLE, {{"X"}, {"Out"}}}};
+        {G_OP_TYPE_PIXEL_SHUFFLE, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_EXPAND, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_GRID_SAMPLER, {{"X", "Grid"}, {"Output"}}}};
 }  // namespace paddle_mobile
--- a/mobile/src/common/types.h
+++ b/mobile/src/common/types.h
@@ -265,6 +265,8 @@ extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN;
 extern const char *G_OP_TYPE_FUSION_DECONV_BN_RELU;
 extern const char *G_OP_TYPE_FUSION_INSTANCENORM_RELU;
 extern const char *G_OP_TYPE_PIXEL_SHUFFLE;
+extern const char *G_OP_TYPE_EXPAND;
+extern const char *G_OP_TYPE_GRID_SAMPLER;

 extern std::unordered_map<
    std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>

--- a/mobile/src/framework/load_ops.h
+++ b/mobile/src/framework/load_ops.h
@@ -246,7 +246,7 @@ LOAD_OP2(fusion_conv_bn, CPU, FPGA);
 LOAD_FUSION_MATCHER(fusion_conv_bn);
 #endif
 #ifdef ELEMENTWISESUB_OP
-LOAD_OP1(elementwise_sub, CPU)
+LOAD_OP2(elementwise_sub, CPU, GPU_CL)
 #endif
 #ifdef TOP_K_OP
 LOAD_OP1(top_k, CPU)
@@ -380,3 +380,9 @@ LOAD_OP1(reduce_prod, CPU);
 #ifdef PIXEL_SHUFFLE_OP
 LOAD_OP1(pixel_shuffle, GPU_CL);
 #endif
+#ifdef EXPAND_OP
+LOAD_OP1(expand, GPU_CL);
+#endif
+#ifdef GRID_SAMPLER_OP
+LOAD_OP1(grid_sampler, GPU_CL);
+#endif
--- a/mobile/src/operators/elementwise_sub_op.cpp
+++ b/mobile/src/operators/elementwise_sub_op.cpp
@@ -32,6 +32,9 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(elementwise_sub, ops::ElementwiseSubOp);
 #endif
+#ifdef PADDLE_MOBILE_CL
+REGISTER_OPERATOR_CL(elementwise_sub, ops::ElementwiseSubOp);
+#endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif


--- a/mobile/src/operators/fusion_instancenorm_relu_op.h
+++ b/mobile/src/operators/fusion_instancenorm_relu_op.h
@@ -45,7 +45,7 @@ class FusionInstanceNormReluMatcher : public framework::FusionOpMatcher {
 template <typename DeviceType, typename T>
 class FusionInstanceNormReluOp
    : public framework::OperatorWithKernel<
-          DeviceType, InstanceNormParam<DeviceType>,
+          DeviceType, FusionInstanceNormReluParam<DeviceType>,
          operators::InstanceNormReluKernel<DeviceType, T>> {
 public:
  FusionInstanceNormReluOp(const string &type, const VariableNameMap &inputs,
@@ -53,7 +53,7 @@ class FusionInstanceNormReluOp
                           const framework::AttributeMap &attrs,
                           framework::Scope *scope)
      : framework::OperatorWithKernel<
-            DeviceType, InstanceNormParam<DeviceType>,
+            DeviceType, FusionInstanceNormReluParam<DeviceType>,
            operators::InstanceNormReluKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}


--- a/mobile/src/operators/instancenorm_op.cpp
+++ b/mobile/src/operators/instancenorm_op.cpp
@@ -24,7 +24,7 @@ namespace operators {
 template <typename Dtype, typename T>
 void InstanceNormOp<Dtype, T>::InferShape() const {
  auto x_dims = this->param_.InputX()->dims();
-  this->param_.Out()->Resize(x_dims);
+  this->param_.OutputY()->Resize(x_dims);
 }

 }  // namespace operators

--- a/mobile/src/operators/kernel/cl/cl-kernel-func/instancenorm_func.cpp
+++ b/mobile/src/operators/kernel/cl/cl-kernel-func/instancenorm_func.cpp
@@ -17,17 +17,17 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
 void InstanceNorm(framework::CLHelper *cl_helper,
-                  const InstanceNormParam<GPU_CL> &param) {
+                  const framework::CLImage *input, framework::CLImage *output,
+                  float epsilon) {
  auto kernel = cl_helper->KernelAt(0);

-  auto &dims = param.Out()->dims();
+  auto &dims = output->dims();
  const int n = dims[0];
  const int c_group = (dims[1] + 3) / 4;
  const int h = dims[2];
  const int w = dims[3];
-  auto epsilon = param.Epsilon();
-  auto input = param.InputX()->GetCLImage();
-  auto out = param.Out()->GetCLImage();
+  auto input_image = input->GetCLImage();
+  auto out_image = output->GetCLImage();

  //      DLOG << "Epsilon: " << epsilon;

@@ -66,9 +66,9 @@ void InstanceNorm(framework::CLHelper *cl_helper,
  CL_CHECK_ERRORS(status);
  clSetKernelArg(kernel, 5, sizeof(cl_float), &epsilon);
  CL_CHECK_ERRORS(status);
-  clSetKernelArg(kernel, 6, sizeof(cl_mem), &input);
+  clSetKernelArg(kernel, 6, sizeof(cl_mem), &input_image);
  CL_CHECK_ERRORS(status);
-  clSetKernelArg(kernel, 7, sizeof(cl_mem), &out);
+  clSetKernelArg(kernel, 7, sizeof(cl_mem), &out_image);
  CL_CHECK_ERRORS(status);
  clEnqueueNDRangeKernel(cl_helper->CLCommandQueue(), kernel, 3, NULL,
                         work_size, local_work_size, 0, NULL, NULL);

--- a/mobile/src/operators/kernel/cl/cl-kernel-func/instancenorm_func.h
+++ b/mobile/src/operators/kernel/cl/cl-kernel-func/instancenorm_func.h
@@ -21,7 +21,8 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
 void InstanceNorm(framework::CLHelper *cl_helper,
-                  const InstanceNormParam<GPU_CL> &param);
+                  const framework::CLImage *input, framework::CLImage *output,
+                  float epsilon);
 }
 }  // namespace paddle_mobile
 #endif
--- a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_sub_kernel.cl
+++ b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_sub_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+__kernel void elementwise_sub(__global image2d_t inputImage, __global image2d_t bias, __write_only image2d_t outputImage) {
+     int x = get_global_id(0);
+     int y = get_global_id(1);
+     const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+     int2 coords;
+     coords.x = x;
+     coords.y = y;
+     half4 input = read_imageh(inputImage, sampler, coords);
+     half4 biase = read_imageh(bias, sampler, coords);
+     half4 output = input - biase;
+     write_imageh(outputImage, coords, output);
+ }
--- a/mobile/src/operators/kernel/cl/elementwise_sub_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/elementwise_sub_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef ELEMENTWISESUB_OP
+
+#include "operators/kernel/elementwise_sub_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ElementwiseSubKernel<GPU_CL, float>::Init(
+    ElementwiseSubParam<GPU_CL> *param) {
+  framework::CLImage *bias = reinterpret_cast<framework::CLImage *>(
+      const_cast<framework::CLImage *>(param->InputY()));
+  if (bias->dims().size() == 4) {
+    if (!bias->isInit()) {
+      bias->InitNormalCLImage(cl_helper_.CLContext(),
+                              this->cl_helper_.CLCommandQueue());
+    }
+    DLOG << " bias: " << *bias;
+    this->cl_helper_.AddKernel("elementwise_sub", "elementwise_sub_kernel.cl");
+  } else {
+    DLOG << "error:bias dims not support";
+  }
+  return true;
+}
+
+template <>
+void ElementwiseSubKernel<GPU_CL, float>::Compute(
+    const ElementwiseSubParam<GPU_CL> &param) {
+  auto input = param.InputX();
+  auto bias = param.InputY();
+  auto output = param.Out();
+  cl_int status;
+  auto kernel = this->cl_helper_.KernelAt(0);
+  if (bias->dims().size() == 4) {
+    cl_mem input_image = input->GetCLImage();
+    cl_mem bias_image = bias->GetCLImage();
+    cl_mem output_image = output->GetCLImage();
+    status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
+    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bias_image);
+    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &output_image);
+    CL_CHECK_ERRORS(status);
+    auto width = input->ImageWidth();
+    auto height = input->ImageHeight();
+    size_t global_work_size[2] = {width, height};
+    status =
+        clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
+                               NULL, global_work_size, NULL, 0, NULL, NULL);
+    CL_CHECK_ERRORS(status);
+  } else {
+    DLOG << "error:bias dims not support";
+  }
+}
+
+template class ElementwiseSubKernel<GPU_CL, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/mobile/src/operators/kernel/cl/instancenorm_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/instancenorm_kernel.cpp
@@ -23,7 +23,7 @@ namespace operators {

 template <>
 bool InstanceNormKernel<GPU_CL, float>::Init(InstanceNormParam<GPU_CL> *param) {
-  auto &dims = param->Out()->dims();
+  auto &dims = param->OutputY()->dims();
  const int h = dims[2];
  std::string build_options = "";
  if (h == 128) {
@@ -41,7 +41,8 @@ bool InstanceNormKernel<GPU_CL, float>::Init(InstanceNormParam<GPU_CL> *param) {
 template <>
 void InstanceNormKernel<GPU_CL, float>::Compute(
    const InstanceNormParam<GPU_CL> &param) {
-  InstanceNorm(&this->cl_helper_, param);
+  InstanceNorm(&this->cl_helper_, param.InputX(), param.OutputY(),
+               param.Epsilon());
 }

 template class InstanceNormKernel<GPU_CL, float>;

--- a/mobile/src/operators/kernel/cl/instancenorm_relu_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/instancenorm_relu_kernel.cpp
@@ -23,7 +23,7 @@ namespace operators {

 template <>
 bool InstanceNormReluKernel<GPU_CL, float>::Init(
-    InstanceNormParam<GPU_CL> *param) {
+    FusionInstanceNormReluParam<GPU_CL> *param) {
  auto &dims = param->Out()->dims();
  const int h = dims[2];
  std::string build_options = "-DRELU";
@@ -41,8 +41,8 @@ bool InstanceNormReluKernel<GPU_CL, float>::Init(

 template <>
 void InstanceNormReluKernel<GPU_CL, float>::Compute(
-    const InstanceNormParam<GPU_CL> &param) {
-  InstanceNorm(&this->cl_helper_, param);
+    const FusionInstanceNormReluParam<GPU_CL> &param) {
+  InstanceNorm(&this->cl_helper_, param.InputX(), param.Out(), param.Epsilon());
 }

 template class InstanceNormReluKernel<GPU_CL, float>;

--- a/mobile/src/operators/kernel/instancenorm_relu_kernel.h
+++ b/mobile/src/operators/kernel/instancenorm_relu_kernel.h
@@ -30,10 +30,10 @@ using framework::OpKernelBase;

 template <typename DeviceType, typename T>
 class InstanceNormReluKernel
-    : public OpKernelBase<DeviceType, InstanceNormParam<DeviceType>> {
+    : public OpKernelBase<DeviceType, FusionInstanceNormReluParam<DeviceType>> {
 public:
-  void Compute(const InstanceNormParam<DeviceType> &param);
-  bool Init(InstanceNormParam<DeviceType> *param);
+  void Compute(const FusionInstanceNormReluParam<DeviceType> &param);
+  bool Init(FusionInstanceNormReluParam<DeviceType> *param);
 };

 }  // namespace operators

--- a/mobile/src/operators/op_param.h
+++ b/mobile/src/operators/op_param.h
@@ -927,6 +927,35 @@ class InstanceNormParam : public OpParam {
                    Scope *scope)
      : OpParam(inputs, outputs, attrs, scope) {
    input_x_ = InputXFrom<GType>(inputs, *scope);
+    output_y_ = OutputYFrom<GType>(outputs, *scope);
+    epsilon_ = GetAttr<float>("epsilon", attrs);
+  }
+
+  const GType *InputX() const { return input_x_; }
+
+  GType *OutputY() const { return output_y_; }
+
+  const float &Epsilon() const { return epsilon_; }
+
+ private:
+  GType *input_x_;
+  GType *output_y_;
+  float epsilon_;
+};
+#endif
+
+#ifdef FUSION_INSTANCENORM_RELU_OP
+template <typename Dtype>
+class FusionInstanceNormReluParam : public OpParam {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
+ public:
+  FusionInstanceNormReluParam(const VariableNameMap &inputs,
+                              const VariableNameMap &outputs,
+                              const AttributeMap &attrs, Scope *scope)
+      : OpParam(inputs, outputs, attrs, scope) {
+    input_x_ = InputXFrom<GType>(inputs, *scope);
    out_ = OutFrom<GType>(outputs, *scope);
    epsilon_ = GetAttr<float>("epsilon", attrs);
  }
@@ -3658,5 +3687,56 @@ class PixelShuffleParam : public OpParam {
 };
 #endif

+#ifdef EXPAND_OP
+template <typename Dtype>
+class ExpandParam : public OpParam {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
+ public:
+  ExpandParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
+              const AttributeMap &attrs, Scope *scope)
+      : OpParam(inputs, outputs, attrs, scope) {
+    input_x_ = InputXFrom<GType>(inputs, *scope);
+    out_ = OutFrom<GType>(outputs, *scope);
+    expand_times_ = GetAttr<std::vector<int>>("expand_times", attrs);
+  }
+
+  const GType *InputX() const { return input_x_; }
+
+  GType *Out() const { return out_; }
+
+ private:
+  GType *input_x_;
+  GType *out_;
+  std::vector<int> expand_times_;
+};
+#endif
+
+#ifdef GRID_SAMPLER_OP
+template <typename Dtype>
+class GridSamplerParam : public OpParam {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
+ public:
+  GridSamplerParam(const VariableNameMap &inputs,
+                   const VariableNameMap &outputs, const AttributeMap &attrs,
+                   Scope *scope)
+      : OpParam(inputs, outputs, attrs, scope) {
+    input_x_ = InputXFrom<GType>(inputs, *scope);
+    output_ = OutputFrom<GType>(outputs, *scope);
+  }
+
+  const GType *InputX() const { return input_x_; }
+
+  GType *Output() const { return output_; }
+
+ private:
+  GType *input_x_;
+  GType *output_;
+};
+#endif
+
 }  // namespace operators
 }  // namespace paddle_mobile
--- a/mobile/tools/op.cmake
+++ b/mobile/tools/op.cmake
@@ -379,6 +379,8 @@ if(NOT FOUND_MATCH)
  set(REDUCE_PROD_OP ON)
  set(FUSION_INSTANCENORM_RELU_OP ON)
  set(PIXEL_SHUFFLE_OP ON)
+  set(EXPAND_OP ON)
+  set(GRID_SAMPLER_OP ON)
 endif()

  # option(BATCHNORM_OP "" ON)
@@ -755,3 +757,10 @@ endif()
 if (PIXEL_SHUFFLE_OP)
  add_definitions(-DPIXEL_SHUFFLE_OP)
 endif()
+if (EXPAND_OP)
+  add_definitions(-DEXPAND_OP)
+endif()
+if (GRID_SAMPLER_OP)
+  add_definitions(-DGRID_SAMPLER_OP)
+endif()
+