diff --git a/src/operators/kernel/cl/batchnorm_kernel.cpp b/src/operators/kernel/cl/batchnorm_kernel.cpp index a096fae81d0e3d2b03ee582e85f49c1b84627ae2..8770ce70191197790c4e0b1dfbd4523ef83e5d4c 100644 --- a/src/operators/kernel/cl/batchnorm_kernel.cpp +++ b/src/operators/kernel/cl/batchnorm_kernel.cpp @@ -21,12 +21,67 @@ namespace operators { template <> bool BatchNormKernel::Init(BatchNormParam *param) { + this->cl_helper_.AddKernel("batchnorm", "batchnorm_kernel.cl"); + const framework::CLImage *mean = param->InputMean(); + const framework::CLImage *variance = param->InputVariance(); + const framework::CLImage *scale = param->InputScale(); + const framework::CLImage *bias = param->InputBias(); + const float epsilon = param->Epsilon(); + + auto mean_ptr = mean->data(); + auto variance_ptr = variance->data(); + auto scale_ptr = scale->data(); + auto bias_ptr = bias->data(); + + const int C = mean->numel(); + float inv_std_ptr[C]; + for (int i = 0; i < C; i++) { + inv_std_ptr[i] = + 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); + } + float *new_scale_ptr = new float[C]; + float *new_bias_ptr = new float[C]; + + for (int i = 0; i < C; i++) { + new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; + new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; + } + + delete[](new_scale_ptr); + delete[](new_bias_ptr); + + framework::CLImage *new_scale = new framework::CLImage(); + framework::CLImage *new_bias = new framework::CLImage(); + + param->SetNewScale(new_scale); + param->SetNewBias(new_bias); + return true; } template <> void BatchNormKernel::Compute( - const BatchNormParam ¶m) {} + const BatchNormParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.OutputY()); + + auto input = param.InputX()->GetCLImage(); + auto out = param.OutputY()->GetCLImage(); + auto new_scale = param.NewScale()->GetCLImage(); + auto new_bias = param.NewBias()->GetCLImage(); + const int out_height = param.OutputY()->HeightOfOneBlock(); + const int out_width = param.OutputY()->WidthOfOneBlock(); + + clSetKernelArg(kernel, 0, sizeof(int), &out_height); + clSetKernelArg(kernel, 1, sizeof(int), &out_width); + clSetKernelArg(kernel, 2, sizeof(cl_mem), &input); + clSetKernelArg(kernel, 3, sizeof(cl_mem), &new_scale); + clSetKernelArg(kernel, 4, sizeof(cl_mem), &new_bias); + clSetKernelArg(kernel, 5, sizeof(cl_mem), &out); + + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, + default_work_size.data(), NULL, 0, NULL, NULL); +} template class BatchNormKernel; diff --git a/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl b/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..d2cc2151422255f48f81550f7424ec2dccb3be41 --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl @@ -0,0 +1,24 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +__kernel void batchnorm(__private const int out_height, + __private const int out_width, + __read_only image2d_t input, + __read_only image2d_t new_scale, + __read_only image2d_t new_bias, + __write_only image2d_t output) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + half4 new_scale = read_imageh(bn_scale, sampler, (int2)(out_c, 0)); + half4 new_bias = read_imageh(bn_bias, sampler, (int2)(out_c, 0)); + + int pos_x = mad24(out_c, out_width, out_w); + half4 in = read_imageh(input, sampler, (int2)(pos_x, out_nh)); + half4 out = mad(in, new_scale, new_bias); + + write_imageh(output, (int2)(pos_x, nh), out); +} diff --git a/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl b/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..10f39f9cf9549a6c1a5abe2af905f94f7355220e --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl @@ -0,0 +1,27 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +__kernel void fetch(__private const int in_height, + __private const int in_width, + __private const int size_ch, + __private const int size_block, + __private const int size_batch, + __read_only image2d_t input, + __global float* out) { + const int in_c = get_global_id(0); + const int in_w = get_global_id(1); + const int in_nh = get_global_id(2); + const int in_n = in_nh / in_height; + const int in_h = in_nh % in_height; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + const int pos_x = mad24(in_c, in_width, in_w); + half4 in = read_imageh(input, sampler, (int2)(pos_x, in_nh)); + + const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; + out[index] = convert_float(in.x); + out[index + size_ch] = convert_float(in.y); + out[index + size_ch * 2] = convert_float(in.z); + out[index + size_ch * 3] = convert_float(in.w); +} diff --git a/src/operators/kernel/cl/cl_kernel/pool_kernel.cl b/src/operators/kernel/cl/cl_kernel/pool_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..18246fddcfb803adeae5cc9e2efeba1a4362aa2e --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/pool_kernel.cl @@ -0,0 +1,75 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#define MIN_VALUE -FLT_MAX + +__kernel void pool_max( + __private const int in_height, __private const int in_width, + __private const int out_height, __private const int out_width, + __private const int pad_top, __private const int pad_left, + __private const int stride_h, __private const int stride_w, + __private const int ksize_h, __private const int ksize_w, + __read_only image2d_t input, __write_only image2d_t output) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int start_h = max(out_h * stride_h - pad_top, 0); + int end_h = min(start_h + ksize_h, in_height); + + int start_w = max(out_w * stride_w - pad_left, 0); + int end_w = min(start_w + ksize_w, in_width); + + const int pos_in_x = out_c * in_width; + const int pos_in_y = out_n * in_height; + half4 max_value = (half4)(MIN_VALUE); + for (int y = start_h; y < end_h; ++y) { + for (int x = start_w; x < end_w; ++x) { + half4 tmp = read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); + max_value = max(max_value, tmp); + } + } + + const int pos_out_x = mad24(out_c, out_width, out_w); + write_imageh(output, (int2)(pos_out_x, out_nh), max_value); +} + +__kernel void pool_avg( + __private const int in_height, __private const int in_width, + __private const int out_height, __private const int out_width, + __private const int pad_top, __private const int pad_left, + __private const int stride_h, __private const int stride_w, + __private const int ksize_h, __private const int ksize_w, + __read_only image2d_t input, __write_only image2d_t output) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int start_h = max(out_h * stride_h - pad_top, 0); + int end_h = min(start_h + ksize_h, in_height); + + int start_w = max(out_w * stride_w - pad_left, 0); + int end_w = min(start_w + ksize_w, in_width); + + const int pos_in_x = out_c * in_width; + const int pos_in_y = out_n * in_height; + half4 sum = (half4)(0.0f); + int num = 0; + for (int y = start_h; y < end_h; ++y) { + for (int x = start_w; x < end_w; ++x) { + sum += read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); + num++; + } + } + half4 avg = sum / num; + const int pos_out_x = mad24(out_c, out_width, out_w); + write_imageh(output, (int2)(pos_out_x, out_nh), avg); +} \ No newline at end of file diff --git a/src/operators/kernel/cl/fetch_kernel.cpp b/src/operators/kernel/cl/fetch_kernel.cpp index d10bfe7a4bd64c8eb0aaa6ae85f531d3d3dce169..995713ce5afaf0a93bc6b8ddd9928d7cee1c55ff 100644 --- a/src/operators/kernel/cl/fetch_kernel.cpp +++ b/src/operators/kernel/cl/fetch_kernel.cpp @@ -19,11 +19,45 @@ namespace operators { template <> bool FetchKernel::Init(FetchParam *param) { + this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl"); return true; } template <> -void FetchKernel::Compute(const FetchParam ¶m) {} +void FetchKernel::Compute(const FetchParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.InputX()); + + auto input = param.InputX()->GetCLImage(); + auto *out = param.Out(); + + const auto &dims = param.InputX()->dims(); + const int N = dims[0]; + const int C = dims[1]; + const int in_height = dims[2]; + const int in_width = dims[3]; + + int size_ch = in_height * in_width; + int size_block = size_ch * 4; + int size_batch = size_ch * C; + + // need create outputBuffer + cl_image_format imageFormat; + imageFormat.image_channel_order = CL_RGBA; + imageFormat.image_channel_data_type = CL_FLOAT; + cl_mem outputBuffer; + + clSetKernelArg(kernel, 0, sizeof(int), &in_height); + clSetKernelArg(kernel, 1, sizeof(int), &in_width); + clSetKernelArg(kernel, 2, sizeof(int), &size_ch); + clSetKernelArg(kernel, 3, sizeof(int), &size_block); + clSetKernelArg(kernel, 4, sizeof(int), &size_batch); + clSetKernelArg(kernel, 5, sizeof(cl_mem), &input); + clSetKernelArg(kernel, 6, sizeof(cl_mem), &outputBuffer); + + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, + default_work_size.data(), NULL, 0, NULL, NULL); +} template class FetchKernel; diff --git a/src/operators/kernel/cl/pool_kernel.cpp b/src/operators/kernel/cl/pool_kernel.cpp index c24a1babf106afe07e3b3dd30727ed1419af5bf8..802de26e6147aa0bf5d9467c6c6cab0f0148fe59 100644 --- a/src/operators/kernel/cl/pool_kernel.cpp +++ b/src/operators/kernel/cl/pool_kernel.cpp @@ -21,11 +21,51 @@ namespace operators { template <> bool PoolKernel::Init(PoolParam *param) { + std::string pooling_type = param->PoolingType(); + this->cl_helper_.AddKernel("pool_" + pooling_type, "pool_kernel.cl"); return true; } template <> -void PoolKernel::Compute(const PoolParam ¶m) {} +void PoolKernel::Compute(const PoolParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); + + auto input = param.Input()->GetCLImage(); + auto out = param.Output()->GetCLImage(); + + const int in_height = param.Input()->HeightOfOneBlock(); + const int in_width = param.Input()->WidthOfOneBlock(); + const int out_height = param.Output()->HeightOfOneBlock(); + const int out_width = param.Output()->WidthOfOneBlock(); + + std::string pooling_type = param.PoolingType(); + std::vector ksize = param.Ksize(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + const int pad_top = paddings[0]; + const int pad_left = paddings[1]; + const int stride_h = strides[0]; + const int stride_w = strides[1]; + const int ksize_h = ksize[0]; + const int ksize_w = ksize[1]; + + clSetKernelArg(kernel, 0, sizeof(cl_int), &in_height); + clSetKernelArg(kernel, 1, sizeof(cl_int), &in_width); + clSetKernelArg(kernel, 2, sizeof(cl_int), &out_height); + clSetKernelArg(kernel, 3, sizeof(cl_int), &out_width); + clSetKernelArg(kernel, 4, sizeof(cl_int), &pad_top); + clSetKernelArg(kernel, 5, sizeof(cl_int), &pad_left); + clSetKernelArg(kernel, 6, sizeof(cl_int), &stride_h); + clSetKernelArg(kernel, 7, sizeof(cl_int), &stride_w); + clSetKernelArg(kernel, 8, sizeof(cl_int), &ksize_h); + clSetKernelArg(kernel, 9, sizeof(cl_int), &ksize_w); + clSetKernelArg(kernel, 10, sizeof(cl_mem), &input); + clSetKernelArg(kernel, 11, sizeof(cl_mem), &out); + + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, + default_work_size.data(), NULL, 0, NULL, NULL); +} template class PoolKernel; diff --git a/src/operators/op_param.h b/src/operators/op_param.h index 1a1f910d11885fd31f3ff3454ea5782b0ced4eb6..bd80bc8805a857d858c2d4c0d14c417677c2880f 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -614,6 +614,14 @@ class BatchNormParam : OpParam { const string &DataFormat() const { return data_format_; } + void SetNewScale(RType *new_scale) { new_scale_ = new_scale; } + + void SetNewBias(RType *new_bias) { new_bias_ = new_bias; } + + const RType *NewScale() const { return new_scale_; } + + const RType *NewBias() const { return new_bias_; } + private: RType *input_x_; RType *output_y_; @@ -625,6 +633,8 @@ class BatchNormParam : OpParam { float momentum_; bool is_test_; string data_format_; + RType *new_bias_; + RType *new_scale_; }; #endif @@ -936,14 +946,18 @@ class FetchParam : public OpParam { FetchParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, const Scope &scope) { input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + out_ = OutFrom(outputs, scope); } const RType *InputX() const { return input_x_; } - RType *Out() const { return out_; } + Tensor *Out() const { return out_; } + + static Tensor *OutFrom(const VariableNameMap &outputs, const Scope &scope) { + return GetVarValue("Out", outputs, scope); + } private: RType *input_x_; - RType *out_; + Tensor *out_; }; #ifdef TRANSPOSE_OP