Commit eabee4dc authored by NazgulLee, committed by Yanzhan Yang

Fuse instancenorm and relu. test=develop (#1986)
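Registers a fused instancenorm+relu operator (matcher, op definition, and GPU kernel) and reuses the existing OpenCL instancenorm kernel, compiled with -DRELU, so the activation is applied in the same pass.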

Parent 92eeabeb
@@ -132,6 +132,7 @@ const char *G_OP_TYPE_WHILE = "while";
const char *G_OP_TYPE_BEAM_SEARCH_DECODE = "beam_search_decode";
const char *G_OP_TYPE_FILL_CONSTAN_BATCH_SIZE_LIKE =
"fill_constant_batch_size_like";
const char *G_OP_TYPE_FUSION_INSTANCENORM_RELU = "fusion_instancenorm_relu";
std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
@@ -155,6 +156,7 @@ std::unordered_map<
{G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}},
{G_OP_TYPE_BATCHNORM, {{"X"}, {"Y"}}},
{G_OP_TYPE_INSTANCENORM, {{"X"}, {"Out"}}},
{G_OP_TYPE_FUSION_INSTANCENORM_RELU, {{"X"}, {"Out"}}},
{G_OP_TYPE_LRN, {{"X"}, {"Out"}}},
{G_OP_TYPE_CONCAT, {{"X"}, {"Out"}}},
{G_OP_TYPE_SPLIT, {{"X"}, {"Out"}}},
......
@@ -257,8 +257,7 @@ extern const char *G_OP_TYPE_PAD2D;
extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU;
extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN;
extern const char *G_OP_TYPE_FUSION_DECONV_BN_RELU;
extern const char *G_OP_TYPE_PAD2D;
extern const char *G_OP_TYPE_FUSION_INSTANCENORM_RELU;
extern std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_INSTANCENORM_RELU_OP
#include "operators/fusion_instancenorm_relu_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionInstanceNormReluOp<Dtype, T>::InferShape() const {
auto x_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(x_dims);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
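// Register the matcher so the program optimizer can rewrite instancenorm+relu pairs into the fused op.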
REGISTER_FUSION_MATCHER(fusion_instancenorm_relu,
ops::FusionInstanceNormReluMatcher);
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(fusion_instancenorm_relu, ops::FusionInstanceNormReluOp);
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_INSTANCENORM_RELU_OP
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/instancenorm_relu_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
class FusionInstanceNormReluMatcher : public framework::FusionOpMatcher {
public:
FusionInstanceNormReluMatcher() {
node_ = framework::Node(G_OP_TYPE_INSTANCENORM);
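// Node::operator> appends a successor node, declaring the instancenorm -> relu pattern to match.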
node_ > std::make_shared<framework::Node>(G_OP_TYPE_RELU);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
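// Fold the matched relu into this node; the empty map means no inputs or attributes need remapping.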
node->Folder(node_.Depth(), Type(), {}, removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_INSTANCENORM_RELU; }
};
template <typename DeviceType, typename T>
class FusionInstanceNormReluOp
: public framework::OperatorWithKernel<
DeviceType, InstanceNormParam<DeviceType>,
operators::InstanceNormReluKernel<DeviceType, T>> {
public:
FusionInstanceNormReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, InstanceNormParam<DeviceType>,
operators::InstanceNormReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif
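Note that the fused op reuses InstanceNormParam: the ReLU contributes no extra inputs or attributes, so no dedicated param type is needed.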
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#include "cl_common.h"
__kernel void instancenorm(__private const int in_width,
__private const int in_height,
@@ -109,7 +109,11 @@ __kernel void instancenorm(__private const int in_width,
for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) {
int2 intout_pos = (int2)(mad24(c, in_width, xIndex), mad24(n, in_height, yIndex));
float4 in_val = read_imagef(input, sampler, intout_pos);
write_imageh(output, intout_pos, convert_half4((in_val - mean_val) * s));
half4 out_val = convert_half4((in_val - mean_val) * s);
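// RELU is defined via build options when this kernel backs the fused instancenorm+relu op.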
#ifdef RELU
out_val = activation(out_val);
#endif
write_imageh(output, intout_pos, out_val);
}
}
}
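For context, a minimal sketch of the activation helper that cl_common.h is assumed to provide (an illustration under that assumption, not the verbatim header): with -DRELU it clamps negatives to zero, otherwise it is the identity.
// Hypothetical sketch of cl_common.h's activation helper (assumption for
// illustration): a componentwise ReLU when compiled with -DRELU.
inline half4 activation(half4 in) {
#ifdef RELU
  return fmax(in, (half4)(0.0h));
#else
  return in;
#endif
}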
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_INSTANCENORM_RELU_OP
#include "operators/kernel/instancenorm_relu_kernel.h"
#include <algorithm>
#include <cmath>
namespace paddle_mobile {
namespace operators {
template <>
bool InstanceNormReluKernel<GPU_CL, float>::Init(
InstanceNormParam<GPU_CL> *param) {
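// Reuse the generic instancenorm CL kernel; -DRELU compiles in the fused ReLU path.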
const std::string build_options = "-DRELU";
this->cl_helper_.AddKernel("instancenorm", "instancenorm_kernel.cl",
build_options);
return true;
}
template <>
void InstanceNormReluKernel<GPU_CL, float>::Compute(
const InstanceNormParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto &dims = param.Out()->dims();
const int n = dims[0];
const int c_group = (dims[1] + 3) / 4;
const int h = dims[2];
const int w = dims[3];
auto epsilon = param.Epsilon();
auto input = param.InputX()->GetCLImage();
auto out = param.Out()->GetCLImage();
DLOG << "Epsilon: " << epsilon;
auto local_work_size_info = this->cl_helper_.LocalWorkSizeInfo();
DLOG << local_work_size_info.max_work_group_size;
DLOG << local_work_size_info.max_work_item_size0;
DLOG << local_work_size_info.max_work_item_size1;
DLOG << local_work_size_info.max_work_item_size2;
const int max_work_group_size =
std::min(256, static_cast<int>(local_work_size_info.max_work_group_size));
int local_work_size1 = 1;
int local_work_size2 = 1;
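// Exhaustively search for the largest (w, h) local work size that fits within device limits.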
for (int i = 1; i <= local_work_size_info.max_work_item_size1 && i <= w;
i++) {
for (int j = 1; j <= local_work_size_info.max_work_item_size2 && j <= h;
j++) {
if (i * j <= max_work_group_size) {
if (i * j > local_work_size1 * local_work_size2) {
local_work_size1 = i;
local_work_size2 = j;
}
}
}
}
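// One work-group per (batch, channel-group) slice; its threads tile the spatial plane with the block found above.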
const size_t work_size[3] = {(size_t)(n * c_group), (size_t)local_work_size1,
(size_t)local_work_size2};
const size_t local_work_size[3] = {(size_t)1, (size_t)local_work_size1,
(size_t)local_work_size2};
DLOG << "work_size" << work_size[0] << " " << work_size[1] << " "
<< work_size[2];
DLOG << "local_work_size" << local_work_size[0] << " " << local_work_size[1]
<< " " << local_work_size[2];
cl_int status;
// Capture each return code so CL_CHECK_ERRORS inspects a meaningful status.
status = clSetKernelArg(kernel, 0, sizeof(cl_int), &w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_int), &h);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(cl_int), &c_group);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_int), &local_work_size1);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_int), &local_work_size2);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_float), &epsilon);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &out);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3,
NULL, work_size, local_work_size, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
template class InstanceNormReluKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef FUSION_INSTANCENORM_RELU_OP
#include <vector>
#include "framework/operator.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using framework::OpKernelBase;
template <typename DeviceType, typename T>
class InstanceNormReluKernel
: public OpKernelBase<DeviceType, InstanceNormParam<DeviceType>> {
public:
void Compute(const InstanceNormParam<DeviceType> &param);
bool Init(InstanceNormParam<DeviceType> *param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -377,6 +377,7 @@ if(NOT FOUND_MATCH)
set(FILL_CONSTANT_BATCH_SIZE_LIKE_OP ON)
set(RANGE_OP ON)
set(REDUCE_PROD_OP ON)
set(FUSION_INSTANCENORM_RELU_OP ON)
endif()
# option(BATCHNORM_OP "" ON)
@@ -413,6 +414,9 @@ endif()
if (INSTANCENORM_OP)
add_definitions(-DINSTANCENORM_OP)
endif()
if (FUSION_INSTANCENORM_RELU_OP)
add_definitions(-DFUSION_INSTANCENORM_RELU_OP)
endif()
if (BOXCODER_OP)
add_definitions(-DBOXCODER_OP)
endif()
......