提交 a96d013c 编写于 作者: N NazgulLee 提交者: Yanzhan Yang

optimize instance norm. test=develop (#2120)

上级 4139dc2c
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/kernel/cl/cl-kernel-func/instancenorm_func.h"
#include <algorithm>
namespace paddle_mobile {
namespace operators {
void InstanceNorm(framework::CLHelper *cl_helper,
const InstanceNormParam<GPU_CL> &param) {
auto kernel = cl_helper->KernelAt(0);
auto &dims = param.Out()->dims();
const int n = dims[0];
const int c_group = (dims[1] + 3) / 4;
const int h = dims[2];
const int w = dims[3];
auto epsilon = param.Epsilon();
auto input = param.InputX()->GetCLImage();
auto out = param.Out()->GetCLImage();
// DLOG << "Epsilon: " << epsilon;
auto local_work_size_info = cl_helper->LocalWorkSizeInfo();
//
// DLOG << local_work_size_info.max_work_group_size;
// DLOG << local_work_size_info.max_work_item_size0;
// DLOG << local_work_size_info.max_work_item_size1;
// DLOG << local_work_size_info.max_work_item_size2;
int maxTotal =
std::min(static_cast<int>(local_work_size_info.max_work_group_size), 256);
int local_work_size1 =
std::min(static_cast<int>(local_work_size_info.max_work_item_size1),
std::min(256, w));
int local_work_size2 = 1;
const size_t work_size[3] = {(size_t)(n * c_group), (size_t)local_work_size1,
(size_t)local_work_size2};
const size_t local_work_size[3] = {(size_t)1, (size_t)local_work_size1,
(size_t)local_work_size2};
// DLOG << "work_size" << work_size[0] << " " << work_size[1] << " "
// << work_size[2];
// DLOG << "local_work_size" << local_work_size[0] << " " <<
// local_work_size[1]
// << " " << local_work_size[2];
cl_int status;
clSetKernelArg(kernel, 0, sizeof(cl_int), &w);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 1, sizeof(cl_int), &h);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 2, sizeof(cl_int), &c_group);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 3, sizeof(cl_int), &local_work_size1);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 4, sizeof(cl_int), &local_work_size2);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 5, sizeof(cl_float), &epsilon);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 6, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 7, sizeof(cl_mem), &out);
CL_CHECK_ERRORS(status);
clEnqueueNDRangeKernel(cl_helper->CLCommandQueue(), kernel, 3, NULL,
work_size, local_work_size, 0, NULL, NULL);
}
} // namespace operators
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if defined(INSTANCENORM_OP) || defined(FUSION_INSTANCENORM_RELU_OP)
#pragma once
#include "framework/cl/cl_helper.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
void InstanceNorm(framework::CLHelper *cl_helper,
const InstanceNormParam<GPU_CL> &param);
}
} // namespace paddle_mobile
#endif
...@@ -32,13 +32,19 @@ __kernel void instancenorm(__private const int in_width, ...@@ -32,13 +32,19 @@ __kernel void instancenorm(__private const int in_width,
const sampler_t sampler = const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
#ifdef LOCAL_MEM_128
__local float4 shared_mem[128];
#elif defined(LOCAL_MEM_64)
__local float4 shared_mem[64];
#else
__local float4 shared_mem[256]; __local float4 shared_mem[256];
#endif
int xOffset = c * in_width;
int yOffset = n * in_height;
float4 sum = 0.0f; float4 sum = 0.0f;
for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) { for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) {
for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) { for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) {
sum += read_imagef(input, sampler, (int2)(mad24(c, in_width, xIndex), mad24(n, in_height, yIndex))); sum += read_imagef(input, sampler, (int2)(xOffset + xIndex, yOffset + yIndex));
} }
} }
shared_mem[local_id] = sum; shared_mem[local_id] = sum;
...@@ -73,7 +79,8 @@ __kernel void instancenorm(__private const int in_width, ...@@ -73,7 +79,8 @@ __kernel void instancenorm(__private const int in_width,
sum = 0.0f; sum = 0.0f;
for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) { for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) {
for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) { for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) {
sum += pow(read_imagef(input, sampler, (int2)(mad24(c, in_width, xIndex), mad24(n, in_height, yIndex))) - mean_val, 2); float4 temp = read_imagef(input, sampler, (int2)(xOffset + xIndex, yOffset + yIndex)) - mean_val;
sum += temp * temp;
} }
} }
shared_mem[local_id] = sum; shared_mem[local_id] = sum;
...@@ -107,7 +114,7 @@ __kernel void instancenorm(__private const int in_width, ...@@ -107,7 +114,7 @@ __kernel void instancenorm(__private const int in_width,
for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) { for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) {
for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) { for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) {
int2 intout_pos = (int2)(mad24(c, in_width, xIndex), mad24(n, in_height, yIndex)); int2 intout_pos = (int2)(xOffset + xIndex, yOffset + yIndex);
float4 in_val = read_imagef(input, sampler, intout_pos); float4 in_val = read_imagef(input, sampler, intout_pos);
half4 out_val = convert_half4((in_val - mean_val) * s); half4 out_val = convert_half4((in_val - mean_val) * s);
#ifdef RELU #ifdef RELU
......
...@@ -16,74 +16,32 @@ limitations under the License. */ ...@@ -16,74 +16,32 @@ limitations under the License. */
#include "operators/kernel/instancenorm_kernel.h" #include "operators/kernel/instancenorm_kernel.h"
#include <cmath> #include <cmath>
#include "operators/kernel/cl/cl-kernel-func/instancenorm_func.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <> template <>
bool InstanceNormKernel<GPU_CL, float>::Init(InstanceNormParam<GPU_CL> *param) { bool InstanceNormKernel<GPU_CL, float>::Init(InstanceNormParam<GPU_CL> *param) {
this->cl_helper_.AddKernel("instancenorm", "instancenorm_kernel.cl"); auto &dims = param->Out()->dims();
const int h = dims[2];
std::string build_options = "";
if (h == 128) {
build_options = "-DLOCAL_MEM_128";
} else if (h == 64) {
build_options = "-DLOCAL_MEM_64";
} else if (h > 256) {
PADDLE_MOBILE_THROW_EXCEPTION("instance norm unsupported input height");
}
this->cl_helper_.AddKernel("instancenorm", "instancenorm_kernel.cl",
build_options);
return true; return true;
} }
template <> template <>
void InstanceNormKernel<GPU_CL, float>::Compute( void InstanceNormKernel<GPU_CL, float>::Compute(
const InstanceNormParam<GPU_CL> &param) { const InstanceNormParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0); InstanceNorm(&this->cl_helper_, param);
auto &dims = param.Out()->dims();
const int n = dims[0];
const int c_group = (dims[1] + 3) / 4;
const int h = dims[2];
const int w = dims[3];
auto epsilon = param.Epsilon();
auto input = param.InputX()->GetCLImage();
auto out = param.Out()->GetCLImage();
DLOG << "Epsilon: " << epsilon;
auto local_work_size_info = this->cl_helper_.LocalWorkSizeInfo();
DLOG << local_work_size_info.max_work_group_size;
DLOG << local_work_size_info.max_work_item_size0;
DLOG << local_work_size_info.max_work_item_size1;
DLOG << local_work_size_info.max_work_item_size2;
int local_work_size1 =
std::min(static_cast<int>(local_work_size_info.max_work_item_size1),
std::min(256, w));
int local_work_size2 = 1;
const size_t work_size[3] = {(size_t)(n * c_group), (size_t)local_work_size1,
(size_t)local_work_size2};
const size_t local_work_size[3] = {(size_t)1, (size_t)local_work_size1,
(size_t)local_work_size2};
DLOG << "work_size" << work_size[0] << " " << work_size[1] << " "
<< work_size[2];
DLOG << "local_work_size" << local_work_size[0] << " " << local_work_size[1]
<< " " << local_work_size[2];
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_int), &w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_int), &h);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(cl_int), &c_group);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_int), &local_work_size1);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_int), &local_work_size2);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_float), &epsilon);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &out);
CL_CHECK_ERRORS(status);
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
work_size, local_work_size, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
} }
template class InstanceNormKernel<GPU_CL, float>; template class InstanceNormKernel<GPU_CL, float>;
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include "operators/kernel/instancenorm_relu_kernel.h" #include "operators/kernel/instancenorm_relu_kernel.h"
#include <cmath> #include <cmath>
#include "operators/kernel/cl/cl-kernel-func/instancenorm_func.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -23,7 +24,16 @@ namespace operators { ...@@ -23,7 +24,16 @@ namespace operators {
template <> template <>
bool InstanceNormReluKernel<GPU_CL, float>::Init( bool InstanceNormReluKernel<GPU_CL, float>::Init(
InstanceNormParam<GPU_CL> *param) { InstanceNormParam<GPU_CL> *param) {
const std::string build_options = "-DRELU"; auto &dims = param->Out()->dims();
const int h = dims[2];
std::string build_options = "-DRELU";
if (h == 128) {
build_options += " -DLOCAL_MEM_128";
} else if (h == 64) {
build_options += " -DLOCAL_MEM_64";
} else if (h > 256) {
PADDLE_MOBILE_THROW_EXCEPTION("instance norm unsupported input height");
}
this->cl_helper_.AddKernel("instancenorm", "instancenorm_kernel.cl", this->cl_helper_.AddKernel("instancenorm", "instancenorm_kernel.cl",
build_options); build_options);
return true; return true;
...@@ -32,59 +42,7 @@ bool InstanceNormReluKernel<GPU_CL, float>::Init( ...@@ -32,59 +42,7 @@ bool InstanceNormReluKernel<GPU_CL, float>::Init(
template <> template <>
void InstanceNormReluKernel<GPU_CL, float>::Compute( void InstanceNormReluKernel<GPU_CL, float>::Compute(
const InstanceNormParam<GPU_CL> &param) { const InstanceNormParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0); InstanceNorm(&this->cl_helper_, param);
auto &dims = param.Out()->dims();
const int n = dims[0];
const int c_group = (dims[1] + 3) / 4;
const int h = dims[2];
const int w = dims[3];
auto epsilon = param.Epsilon();
auto input = param.InputX()->GetCLImage();
auto out = param.Out()->GetCLImage();
DLOG << "Epsilon: " << epsilon;
auto local_work_size_info = this->cl_helper_.LocalWorkSizeInfo();
DLOG << local_work_size_info.max_work_group_size;
DLOG << local_work_size_info.max_work_item_size0;
DLOG << local_work_size_info.max_work_item_size1;
DLOG << local_work_size_info.max_work_item_size2;
int local_work_size1 =
std::min(static_cast<int>(local_work_size_info.max_work_item_size1),
std::min(256, w));
int local_work_size2 = 1;
const size_t work_size[3] = {(size_t)(n * c_group), (size_t)local_work_size1,
(size_t)local_work_size2};
const size_t local_work_size[3] = {(size_t)1, (size_t)local_work_size1,
(size_t)local_work_size2};
DLOG << "work_size" << work_size[0] << " " << work_size[1] << " "
<< work_size[2];
DLOG << "local_work_size" << local_work_size[0] << " " << local_work_size[1]
<< " " << local_work_size[2];
cl_int status;
clSetKernelArg(kernel, 0, sizeof(cl_int), &w);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 1, sizeof(cl_int), &h);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 2, sizeof(cl_int), &c_group);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 3, sizeof(cl_int), &local_work_size1);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 4, sizeof(cl_int), &local_work_size2);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 5, sizeof(cl_float), &epsilon);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 6, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 7, sizeof(cl_mem), &out);
CL_CHECK_ERRORS(status);
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
work_size, local_work_size, 0, NULL, NULL);
} }
template class InstanceNormReluKernel<GPU_CL, float>; template class InstanceNormReluKernel<GPU_CL, float>;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册