Commit a96d013c authored by NazgulLee, committed by Yanzhan Yang

optimize instance norm. test=develop (#2120)

Parent: 4139dc2c
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/kernel/cl/cl-kernel-func/instancenorm_func.h"
#include <algorithm>
namespace paddle_mobile {
namespace operators {
void InstanceNorm(framework::CLHelper *cl_helper,
const InstanceNormParam<GPU_CL> &param) {
auto kernel = cl_helper->KernelAt(0);
auto &dims = param.Out()->dims();
const int n = dims[0];
const int c_group = (dims[1] + 3) / 4;
const int h = dims[2];
const int w = dims[3];
auto epsilon = param.Epsilon();
auto input = param.InputX()->GetCLImage();
auto out = param.Out()->GetCLImage();
auto local_work_size_info = cl_helper->LocalWorkSizeInfo();
int maxTotal =
    std::min(static_cast<int>(local_work_size_info.max_work_group_size), 256);
// Cap the work-group size at both the device's total limit and 256.
int local_work_size1 =
    std::min(static_cast<int>(local_work_size_info.max_work_item_size1),
             std::min(maxTotal, w));
int local_work_size2 = 1;
const size_t work_size[3] = {(size_t)(n * c_group), (size_t)local_work_size1,
(size_t)local_work_size2};
const size_t local_work_size[3] = {(size_t)1, (size_t)local_work_size1,
(size_t)local_work_size2};
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_int), &w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_int), &h);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(cl_int), &c_group);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_int), &local_work_size1);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_int), &local_work_size2);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_float), &epsilon);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &out);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(cl_helper->CLCommandQueue(), kernel, 3, NULL,
                                work_size, local_work_size, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
} // namespace operators
} // namespace paddle_mobile
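
To make the launch geometry concrete, here is a minimal, self-contained sketch (hypothetical shape and device limits, not part of the commit) that reproduces the arithmetic above for an assumed 1x32x64x64 output:

#include <algorithm>
#include <cstdio>

// Hypothetical standalone sketch of the launch-geometry math above,
// assuming max_work_group_size and max_work_item_size1 are both 256.
int main() {
  const int n = 1, c = 32, w = 64;      // example NCHW shape (H does not affect the geometry)
  const int max_work_group_size = 256;  // assumed device limit
  const int max_work_item_size1 = 256;  // assumed device limit
  const int c_group = (c + 3) / 4;      // 4 channels per RGBA texel -> 8
  const int maxTotal = std::min(max_work_group_size, 256);
  const int local_work_size1 =
      std::min(max_work_item_size1, std::min(maxTotal, w));
  // One work-group per (batch, channel-group) pair; each group
  // cooperatively reduces one H x W plane.
  std::printf("global = (%d, %d, 1), local = (1, %d, 1)\n", n * c_group,
              local_work_size1, local_work_size1);
  return 0;  // prints: global = (8, 64, 1), local = (1, 64, 1)
}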
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if defined(INSTANCENORM_OP) || defined(FUSION_INSTANCENORM_RELU_OP)
#pragma once
#include "framework/cl/cl_helper.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
void InstanceNorm(framework::CLHelper *cl_helper,
const InstanceNormParam<GPU_CL> &param);
}  // namespace operators
} // namespace paddle_mobile
#endif
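
Both kernels below collapse their Compute() bodies into a single call to this launcher; the shared call site (verbatim from the hunks that follow) reduces to:

// Shared call site, as in both Compute() bodies later in this diff:
template <>
void InstanceNormKernel<GPU_CL, float>::Compute(
    const InstanceNormParam<GPU_CL> &param) {
  InstanceNorm(&this->cl_helper_, param);
}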
......@@ -32,13 +32,19 @@ __kernel void instancenorm(__private const int in_width,
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
#ifdef LOCAL_MEM_128
__local float4 shared_mem[128];
#elif defined(LOCAL_MEM_64)
__local float4 shared_mem[64];
#else
__local float4 shared_mem[256];
#endif
int xOffset = c * in_width;
int yOffset = n * in_height;
float4 sum = 0.0f;
for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) {
for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) {
sum += read_imagef(input, sampler, (int2)(mad24(c, in_width, xIndex), mad24(n, in_height, yIndex)));
sum += read_imagef(input, sampler, (int2)(xOffset + xIndex, yOffset + yIndex));
}
}
shared_mem[local_id] = sum;
......@@ -73,7 +79,8 @@ __kernel void instancenorm(__private const int in_width,
sum = 0.0f;
for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) {
for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) {
sum += pow(read_imagef(input, sampler, (int2)(mad24(c, in_width, xIndex), mad24(n, in_height, yIndex))) - mean_val, 2);
float4 temp = read_imagef(input, sampler, (int2)(xOffset + xIndex, yOffset + yIndex)) - mean_val;
sum += temp * temp;
}
}
shared_mem[local_id] = sum;
......@@ -107,7 +114,7 @@ __kernel void instancenorm(__private const int in_width,
for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) {
for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) {
int2 intout_pos = (int2)(mad24(c, in_width, xIndex), mad24(n, in_height, yIndex));
int2 intout_pos = (int2)(xOffset + xIndex, yOffset + yIndex);
float4 in_val = read_imagef(input, sampler, intout_pos);
half4 out_val = convert_half4((in_val - mean_val) * s);
#ifdef RELU
......
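
Two things change in the kernel body: the per-read mad24 index math is replaced by offsets precomputed once outside the loops, and the variance term squares the centered value with a plain multiply instead of pow, which is considerably cheaper on most GPUs. For reference, a minimal scalar C++ sketch (hypothetical, not part of the commit) of the per-plane arithmetic each work-group computes cooperatively through shared_mem:

#include <cmath>
#include <cstddef>
#include <vector>

// Hypothetical scalar reference for one H*W plane of one channel:
// two-pass mean/variance, matching the kernel's reduction.
void InstanceNormPlane(const std::vector<float> &x, float epsilon,
                       std::vector<float> *y) {
  const float inv_n = 1.0f / static_cast<float>(x.size());
  float mean = 0.0f;
  for (float v : x) mean += v;
  mean *= inv_n;
  float var = 0.0f;
  for (float v : x) {
    float t = v - mean;  // squared via t * t, not pow(t, 2), as in the diff
    var += t * t;
  }
  var *= inv_n;
  const float s = 1.0f / std::sqrt(var + epsilon);
  y->resize(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) (*y)[i] = (x[i] - mean) * s;
}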
......@@ -16,74 +16,32 @@ limitations under the License. */
#include "operators/kernel/instancenorm_kernel.h"
#include <cmath>
#include "operators/kernel/cl/cl-kernel-func/instancenorm_func.h"
namespace paddle_mobile {
namespace operators {
template <>
bool InstanceNormKernel<GPU_CL, float>::Init(InstanceNormParam<GPU_CL> *param) {
this->cl_helper_.AddKernel("instancenorm", "instancenorm_kernel.cl");
auto &dims = param->Out()->dims();
const int h = dims[2];
std::string build_options = "";
if (h == 128) {
build_options = "-DLOCAL_MEM_128";
} else if (h == 64) {
build_options = "-DLOCAL_MEM_64";
} else if (h > 256) {
PADDLE_MOBILE_THROW_EXCEPTION("instance norm unsupported input height");
}
this->cl_helper_.AddKernel("instancenorm", "instancenorm_kernel.cl",
build_options);
return true;
}
template <>
void InstanceNormKernel<GPU_CL, float>::Compute(
const InstanceNormParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto &dims = param.Out()->dims();
const int n = dims[0];
const int c_group = (dims[1] + 3) / 4;
const int h = dims[2];
const int w = dims[3];
auto epsilon = param.Epsilon();
auto input = param.InputX()->GetCLImage();
auto out = param.Out()->GetCLImage();
DLOG << "Epsilon: " << epsilon;
auto local_work_size_info = this->cl_helper_.LocalWorkSizeInfo();
DLOG << local_work_size_info.max_work_group_size;
DLOG << local_work_size_info.max_work_item_size0;
DLOG << local_work_size_info.max_work_item_size1;
DLOG << local_work_size_info.max_work_item_size2;
int local_work_size1 =
std::min(static_cast<int>(local_work_size_info.max_work_item_size1),
std::min(256, w));
int local_work_size2 = 1;
const size_t work_size[3] = {(size_t)(n * c_group), (size_t)local_work_size1,
(size_t)local_work_size2};
const size_t local_work_size[3] = {(size_t)1, (size_t)local_work_size1,
(size_t)local_work_size2};
DLOG << "work_size" << work_size[0] << " " << work_size[1] << " "
<< work_size[2];
DLOG << "local_work_size" << local_work_size[0] << " " << local_work_size[1]
<< " " << local_work_size[2];
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_int), &w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_int), &h);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(cl_int), &c_group);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_int), &local_work_size1);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_int), &local_work_size2);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_float), &epsilon);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &out);
CL_CHECK_ERRORS(status);
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
work_size, local_work_size, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
InstanceNorm(&this->cl_helper_, param);
}
template class InstanceNormKernel<GPU_CL, float>;
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "operators/kernel/instancenorm_relu_kernel.h"
#include <cmath>
#include "operators/kernel/cl/cl-kernel-func/instancenorm_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -23,7 +24,16 @@ namespace operators {
template <>
bool InstanceNormReluKernel<GPU_CL, float>::Init(
InstanceNormParam<GPU_CL> *param) {
const std::string build_options = "-DRELU";
auto &dims = param->Out()->dims();
const int h = dims[2];
std::string build_options = "-DRELU";
if (h == 128) {
build_options += " -DLOCAL_MEM_128";
} else if (h == 64) {
build_options += " -DLOCAL_MEM_64";
} else if (h > 256) {
PADDLE_MOBILE_THROW_EXCEPTION("instance norm unsupported input height");
}
this->cl_helper_.AddKernel("instancenorm", "instancenorm_kernel.cl",
build_options);
return true;
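
As a concrete example (hypothetical input, not from the commit), a 64-pixel-high tensor yields:

// build_options == "-DRELU -DLOCAL_MEM_64"
// -> selects `__local float4 shared_mem[64];` in the kernel and enables
//    its #ifdef RELU output clamp.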
......@@ -32,59 +42,7 @@ bool InstanceNormReluKernel<GPU_CL, float>::Init(
template <>
void InstanceNormReluKernel<GPU_CL, float>::Compute(
const InstanceNormParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto &dims = param.Out()->dims();
const int n = dims[0];
const int c_group = (dims[1] + 3) / 4;
const int h = dims[2];
const int w = dims[3];
auto epsilon = param.Epsilon();
auto input = param.InputX()->GetCLImage();
auto out = param.Out()->GetCLImage();
DLOG << "Epsilon: " << epsilon;
auto local_work_size_info = this->cl_helper_.LocalWorkSizeInfo();
DLOG << local_work_size_info.max_work_group_size;
DLOG << local_work_size_info.max_work_item_size0;
DLOG << local_work_size_info.max_work_item_size1;
DLOG << local_work_size_info.max_work_item_size2;
int local_work_size1 =
std::min(static_cast<int>(local_work_size_info.max_work_item_size1),
std::min(256, w));
int local_work_size2 = 1;
const size_t work_size[3] = {(size_t)(n * c_group), (size_t)local_work_size1,
(size_t)local_work_size2};
const size_t local_work_size[3] = {(size_t)1, (size_t)local_work_size1,
(size_t)local_work_size2};
DLOG << "work_size" << work_size[0] << " " << work_size[1] << " "
<< work_size[2];
DLOG << "local_work_size" << local_work_size[0] << " " << local_work_size[1]
<< " " << local_work_size[2];
cl_int status;
clSetKernelArg(kernel, 0, sizeof(cl_int), &w);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 1, sizeof(cl_int), &h);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 2, sizeof(cl_int), &c_group);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 3, sizeof(cl_int), &local_work_size1);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 4, sizeof(cl_int), &local_work_size2);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 5, sizeof(cl_float), &epsilon);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 6, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 7, sizeof(cl_mem), &out);
CL_CHECK_ERRORS(status);
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
work_size, local_work_size, 0, NULL, NULL);
InstanceNorm(&this->cl_helper_, param);
}
template class InstanceNormReluKernel<GPU_CL, float>;
......