Commit 84f34a23 authored by Jiaying Zhao, committed by GitHub

Merge pull request #1064 from smilejames/opencl

add opencl kernel of batchnorm, pool, fetch
......@@ -21,12 +21,67 @@ namespace operators {
template <>
bool BatchNormKernel<GPU_CL, float>::Init(BatchNormParam<GPU_CL> *param) {
this->cl_helper_.AddKernel("batchnorm", "batchnorm_kernel.cl");
const framework::CLImage *mean = param->InputMean();
const framework::CLImage *variance = param->InputVariance();
const framework::CLImage *scale = param->InputScale();
const framework::CLImage *bias = param->InputBias();
const float epsilon = param->Epsilon();
auto mean_ptr = mean->data<float>();
auto variance_ptr = variance->data<float>();
auto scale_ptr = scale->data<float>();
auto bias_ptr = bias->data<float>();
const int C = mean->numel();
float inv_std_ptr[C];
for (int i = 0; i < C; i++) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
float *new_scale_ptr = new float[C];
float *new_bias_ptr = new float[C];
for (int i = 0; i < C; i++) {
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
}
framework::CLImage *new_scale = new framework::CLImage();
framework::CLImage *new_bias = new framework::CLImage();
// NOTE: the folded values in new_scale_ptr / new_bias_ptr still have to be
// uploaded into the two CLImages (e.g. through the CLImage tensor-data
// setter) before the host buffers are released; that wiring is not part of
// this diff yet.
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
delete[](new_scale_ptr);
delete[](new_bias_ptr);
return true;
}
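Init() folds the batch-norm statistics into a single per-channel scale and bias, so the kernel only has to compute out = in * new_scale + new_bias. A minimal host-side sanity check of that algebra (standalone C++, illustrative only, not part of the diff):

#include <cassert>
#include <cmath>

// Verifies new_scale * x + new_bias == (x - mean) / sqrt(var + eps) * scale + bias
int main() {
  const float x = 1.5f, mean = 0.2f, variance = 0.9f, scale = 1.3f, bias = -0.4f;
  const float epsilon = 1e-5f;
  const float inv_std = 1.0f / std::sqrt(variance + epsilon);
  const float new_scale = inv_std * scale;
  const float new_bias = bias - mean * inv_std * scale;
  const float folded = new_scale * x + new_bias;
  const float reference = (x - mean) * inv_std * scale + bias;
  assert(std::fabs(folded - reference) < 1e-5f);
  return 0;
}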
template <>
void BatchNormKernel<GPU_CL, float>::Compute(
const BatchNormParam<GPU_CL> &param) {}
const BatchNormParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.OutputY());
auto input = param.InputX()->GetCLImage();
auto out = param.OutputY()->GetCLImage();
auto new_scale = param.NewScale()->GetCLImage();
auto new_bias = param.NewBias()->GetCLImage();
const int out_height = param.OutputY()->HeightOfOneBlock();
const int out_width = param.OutputY()->WidthOfOneBlock();
clSetKernelArg(kernel, 0, sizeof(int), &out_height);
clSetKernelArg(kernel, 1, sizeof(int), &out_width);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 3, sizeof(cl_mem), &new_scale);
clSetKernelArg(kernel, 4, sizeof(cl_mem), &new_bias);
clSetKernelArg(kernel, 5, sizeof(cl_mem), &out);
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
}
template class BatchNormKernel<GPU_CL, float>;
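For reference, the kernels below index the half-precision RGBA image with get_global_id(0) as the channel block (C / 4), get_global_id(1) as the width, and get_global_id(2) as N * H, so DefaultWorkSize() presumably resolves to {(C + 3) / 4, W, N * H}. A hypothetical helper computing the same mapping from NCHW dims (the helper name and the assumption about DefaultWorkSize are mine, not from the diff):

#include <array>
#include <cstddef>

// Work-size layout the kernels expect: {channel blocks of 4, width, batch * height}.
std::array<size_t, 3> WorkSizeForNCHW(int n, int c, int h, int w) {
  return {static_cast<size_t>((c + 3) / 4), static_cast<size_t>(w),
          static_cast<size_t>(n * h)};
}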
......
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void batchnorm(__private const int out_height,
__private const int out_width,
__read_only image2d_t input,
__read_only image2d_t new_scale,
__read_only image2d_t new_bias,
__write_only image2d_t output) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
half4 scale = read_imageh(new_scale, sampler, (int2)(out_c, 0));
half4 bias = read_imageh(new_bias, sampler, (int2)(out_c, 0));
int pos_x = mad24(out_c, out_width, out_w);
half4 in = read_imageh(input, sampler, (int2)(pos_x, out_nh));
half4 out = mad(in, scale, bias);
write_imageh(output, (int2)(pos_x, out_nh), out);
}
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void fetch(__private const int in_height,
__private const int in_width,
__private const int size_ch,
__private const int size_block,
__private const int size_batch,
__read_only image2d_t input,
__global float* out) {
const int in_c = get_global_id(0);
const int in_w = get_global_id(1);
const int in_nh = get_global_id(2);
const int in_n = in_nh / in_height;
const int in_h = in_nh % in_height;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int pos_x = mad24(in_c, in_width, in_w);
half4 in = read_imageh(input, sampler, (int2)(pos_x, in_nh));
const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
out[index] = convert_float(in.x);
out[index + size_ch] = convert_float(in.y);
out[index + size_ch * 2] = convert_float(in.z);
out[index + size_ch * 3] = convert_float(in.w);
}
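The fetch kernel unpacks the channel-blocked RGBA image (texel x = c_block * W + w, y = n * H + h, channel c in component c % 4) into a contiguous NCHW float buffer. A CPU reference of the same mapping, useful for unit-testing the OpenCL path (a sketch under the kernel's assumption that C is handled in blocks of four; the function name is illustrative):

// CPU reference of the fetch kernel's layout conversion.
void FetchReference(const float *image_rgba, float *out_nchw,
                    int N, int C, int H, int W) {
  const int c_blocks = (C + 3) / 4;    // texel columns per width block
  const int row_pitch = c_blocks * W;  // texels per image row
  for (int n = 0; n < N; ++n) {
    for (int c = 0; c < C; ++c) {
      for (int h = 0; h < H; ++h) {
        for (int w = 0; w < W; ++w) {
          const int x = (c / 4) * W + w;  // pos_x = mad24(in_c, in_width, in_w)
          const int y = n * H + h;        // in_nh
          const int texel = (y * row_pitch + x) * 4 + (c % 4);
          out_nchw[((n * C + c) * H + h) * W + w] = image_rgba[texel];
        }
      }
    }
  }
}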
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define MIN_VALUE -FLT_MAX
__kernel void pool_max(
__private const int in_height, __private const int in_width,
__private const int out_height, __private const int out_width,
__private const int pad_top, __private const int pad_left,
__private const int stride_h, __private const int stride_w,
__private const int ksize_h, __private const int ksize_w,
__read_only image2d_t input, __write_only image2d_t output) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const int out_n = out_nh / out_height;
const int out_h = out_nh % out_height;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int start_h = out_h * stride_h - pad_top;
int end_h = min(start_h + ksize_h, in_height);
start_h = max(start_h, 0);
int start_w = out_w * stride_w - pad_left;
int end_w = min(start_w + ksize_w, in_width);
start_w = max(start_w, 0);
const int pos_in_x = out_c * in_width;
const int pos_in_y = out_n * in_height;
half4 max_value = (half4)(MIN_VALUE);
for (int y = start_h; y < end_h; ++y) {
for (int x = start_w; x < end_w; ++x) {
half4 tmp = read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
max_value = max(max_value, tmp);
}
}
const int pos_out_x = mad24(out_c, out_width, out_w);
write_imageh(output, (int2)(pos_out_x, out_nh), max_value);
}
__kernel void pool_avg(
__private const int in_height, __private const int in_width,
__private const int out_height, __private const int out_width,
__private const int pad_top, __private const int pad_left,
__private const int stride_h, __private const int stride_w,
__private const int ksize_h, __private const int ksize_w,
__read_only image2d_t input, __write_only image2d_t output) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const int out_n = out_nh / out_height;
const int out_h = out_nh % out_height;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int start_h = out_h * stride_h - pad_top;
int end_h = min(start_h + ksize_h, in_height);
start_h = max(start_h, 0);
int start_w = out_w * stride_w - pad_left;
int end_w = min(start_w + ksize_w, in_width);
start_w = max(start_w, 0);
const int pos_in_x = out_c * in_width;
const int pos_in_y = out_n * in_height;
half4 sum = (half4)(0.0f);
int num = 0;
for (int y = start_h; y < end_h; ++y) {
for (int x = start_w; x < end_w; ++x) {
sum += read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
num++;
}
}
half4 avg = sum / (half)num;
const int pos_out_x = mad24(out_c, out_width, out_w);
write_imageh(output, (int2)(pos_out_x, out_nh), avg);
}
\ No newline at end of file
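Both pooling kernels clip the window against the padded borders before reducing: the window end is derived from the unclipped start so padding shrinks the window instead of shifting it. A CPU reference for one output element of pool_max, matching that clipping order (illustrative sketch, not part of the diff; PoolMaxReference is a made-up name):

#include <algorithm>
#include <cfloat>

// Max over the pooling window of one (out_h, out_w) position in one channel plane.
float PoolMaxReference(const float *plane, int in_height, int in_width,
                       int out_h, int out_w, int pad_top, int pad_left,
                       int stride_h, int stride_w, int ksize_h, int ksize_w) {
  int start_h = out_h * stride_h - pad_top;
  int end_h = std::min(start_h + ksize_h, in_height);
  start_h = std::max(start_h, 0);
  int start_w = out_w * stride_w - pad_left;
  int end_w = std::min(start_w + ksize_w, in_width);
  start_w = std::max(start_w, 0);
  float max_value = -FLT_MAX;
  for (int y = start_h; y < end_h; ++y) {
    for (int x = start_w; x < end_w; ++x) {
      max_value = std::max(max_value, plane[y * in_width + x]);
    }
  }
  return max_value;
}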
......@@ -19,11 +19,45 @@ namespace operators {
template <>
bool FetchKernel<GPU_CL, float>::Init(FetchParam<GPU_CL> *param) {
this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
return true;
}
template <>
void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {}
void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.InputX());
auto input = param.InputX()->GetCLImage();
auto *out = param.Out();
const auto &dims = param.InputX()->dims();
const int N = dims[0];
const int C = dims[1];
const int in_height = dims[2];
const int in_width = dims[3];
int size_ch = in_height * in_width;
int size_block = size_ch * 4;
int size_batch = size_ch * C;
// The kernel writes into a plain device buffer, not an image, so create the
// buffer here (no image format is needed). The size assumes one float per
// element of the N x C x H x W output with C a multiple of 4, matching the
// kernel's indexing; cl_helper_ is assumed to expose the CL context via
// CLContext(), mirroring CLCommandQueue() below.
cl_mem outputBuffer =
clCreateBuffer(this->cl_helper_.CLContext(), CL_MEM_WRITE_ONLY,
sizeof(float) * N * C * in_height * in_width, NULL, NULL);
clSetKernelArg(kernel, 0, sizeof(int), &in_height);
clSetKernelArg(kernel, 1, sizeof(int), &in_width);
clSetKernelArg(kernel, 2, sizeof(int), &size_ch);
clSetKernelArg(kernel, 3, sizeof(int), &size_block);
clSetKernelArg(kernel, 4, sizeof(int), &size_batch);
clSetKernelArg(kernel, 5, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 6, sizeof(cl_mem), &outputBuffer);
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
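// Sketch of the still-missing read-back: copy the kernel result into the
// host tensor and release the temporary buffer. Assumes framework::Tensor
// provides Resize() and mutable_data<float>(); adjust to the actual API.
out->Resize(dims);
float *out_ptr = out->mutable_data<float>();
clEnqueueReadBuffer(this->cl_helper_.CLCommandQueue(), outputBuffer, CL_TRUE,
0, sizeof(float) * N * C * in_height * in_width, out_ptr,
0, NULL, NULL);
clReleaseMemObject(outputBuffer);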
}
template class FetchKernel<GPU_CL, float>;
......
......@@ -21,11 +21,51 @@ namespace operators {
template <>
bool PoolKernel<GPU_CL, float>::Init(PoolParam<GPU_CL> *param) {
std::string pooling_type = param->PoolingType();
this->cl_helper_.AddKernel("pool_" + pooling_type, "pool_kernel.cl");
return true;
}
template <>
void PoolKernel<GPU_CL, float>::Compute(const PoolParam<GPU_CL> &param) {}
void PoolKernel<GPU_CL, float>::Compute(const PoolParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
auto input = param.Input()->GetCLImage();
auto out = param.Output()->GetCLImage();
const int in_height = param.Input()->HeightOfOneBlock();
const int in_width = param.Input()->WidthOfOneBlock();
const int out_height = param.Output()->HeightOfOneBlock();
const int out_width = param.Output()->WidthOfOneBlock();
std::string pooling_type = param.PoolingType();
std::vector<int> ksize = param.Ksize();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
const int pad_top = paddings[0];
const int pad_left = paddings[1];
const int stride_h = strides[0];
const int stride_w = strides[1];
const int ksize_h = ksize[0];
const int ksize_w = ksize[1];
clSetKernelArg(kernel, 0, sizeof(cl_int), &in_height);
clSetKernelArg(kernel, 1, sizeof(cl_int), &in_width);
clSetKernelArg(kernel, 2, sizeof(cl_int), &out_height);
clSetKernelArg(kernel, 3, sizeof(cl_int), &out_width);
clSetKernelArg(kernel, 4, sizeof(cl_int), &pad_top);
clSetKernelArg(kernel, 5, sizeof(cl_int), &pad_left);
clSetKernelArg(kernel, 6, sizeof(cl_int), &stride_h);
clSetKernelArg(kernel, 7, sizeof(cl_int), &stride_w);
clSetKernelArg(kernel, 8, sizeof(cl_int), &ksize_h);
clSetKernelArg(kernel, 9, sizeof(cl_int), &ksize_w);
clSetKernelArg(kernel, 10, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 11, sizeof(cl_mem), &out);
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
}
template class PoolKernel<GPU_CL, float>;
......
......@@ -614,6 +614,14 @@ class BatchNormParam : OpParam {
const string &DataFormat() const { return data_format_; }
void SetNewScale(RType *new_scale) { new_scale_ = new_scale; }
void SetNewBias(RType *new_bias) { new_bias_ = new_bias; }
const RType *NewScale() const { return new_scale_; }
const RType *NewBias() const { return new_bias_; }
private:
RType *input_x_;
RType *output_y_;
......@@ -625,6 +633,8 @@ class BatchNormParam : OpParam {
float momentum_;
bool is_test_;
string data_format_;
RType *new_bias_;
RType *new_scale_;
};
#endif
......@@ -936,14 +946,18 @@ class FetchParam : public OpParam {
FetchParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
input_x_ = InputXFrom<GType>(inputs, scope);
out_ = OutFrom<GType>(outputs, scope);
out_ = OutFrom(outputs, scope);
}
const RType *InputX() const { return input_x_; }
RType *Out() const { return out_; }
Tensor *Out() const { return out_; }
static Tensor *OutFrom(const VariableNameMap &outputs, const Scope &scope) {
return GetVarValue<Tensor>("Out", outputs, scope);
}
private:
RType *input_x_;
RType *out_;
Tensor *out_;
};
#ifdef TRANSPOSE_OP
......